Merge branch 'main' into use-openai-for-ollama

Matthew Farrellee 2025-09-15 15:31:03 -04:00
commit 91fb6f42cb
74 changed files with 8761 additions and 971 deletions

View file

@@ -13,11 +13,8 @@ on:
     branches: [ main ]
     types: [opened, synchronize, reopened]
     paths:
-      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
-      - 'tests/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
+      - 'docs/_static/llama-stack-spec.yaml'
+      - 'docs/_static/llama-stack-spec.html'
       - '.github/workflows/conformance.yml' # This workflow itself

 concurrency:
@@ -43,10 +40,27 @@ jobs:
           ref: ${{ github.event.pull_request.base.ref }}
           path: 'base'

+      # Cache oasdiff to avoid checksum failures and speed up builds
+      - name: Cache oasdiff
+        id: cache-oasdiff
+        uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809
+        with:
+          path: ~/oasdiff
+          key: oasdiff-${{ runner.os }}
+
       # Install oasdiff: https://github.com/oasdiff/oasdiff, a tool for detecting breaking changes in OpenAPI specs.
       - name: Install oasdiff
+        if: steps.cache-oasdiff.outputs.cache-hit != 'true'
         run: |
           curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh
+          cp /usr/local/bin/oasdiff ~/oasdiff
+
+      # Setup cached oasdiff
+      - name: Setup cached oasdiff
+        if: steps.cache-oasdiff.outputs.cache-hit == 'true'
+        run: |
+          sudo cp ~/oasdiff /usr/local/bin/oasdiff
+          sudo chmod +x /usr/local/bin/oasdiff

       # Run oasdiff to detect breaking changes in the API specification
       # This step will fail if incompatible changes are detected, preventing breaking changes from being merged

View file

@@ -47,11 +47,21 @@ jobs:
         run: npm ci
         working-directory: llama_stack/ui

-      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
+      - name: Run pre-commit
+        id: precommit
+        uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
+        continue-on-error: true
         env:
           SKIP: no-commit-to-branch
           RUFF_OUTPUT_FORMAT: github

+      - name: Check pre-commit results
+        if: steps.precommit.outcome == 'failure'
+        run: |
+          echo "::error::Pre-commit hooks failed. Please run 'pre-commit run --all-files' locally and commit the fixes."
+          echo "::warning::Some pre-commit hooks failed. Check the output above for details."
+          exit 1
+
       - name: Debug
         run: |
           echo "github.ref: ${{ github.ref }}"
@@ -79,17 +89,23 @@
             echo "No changes to commit"
           fi

-      - name: Verify if there are any diff files after pre-commit
+      - name: Verify no uncommitted changes
        if: github.actor != 'dependabot[bot]'
        run: |
-          git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
+          if ! git diff --exit-code; then
+            echo "::error::There are uncommitted changes after pre-commit. Please run 'pre-commit run --all-files' locally and commit the fixes."
+            echo "::warning::Files with changes:"
+            git diff --name-status
+            exit 1
+          fi

       - name: Verify if there are any new files after pre-commit
         if: github.actor != 'dependabot[bot]'
         run: |
           unstaged_files=$(git ls-files --others --exclude-standard)
           if [ -n "$unstaged_files" ]; then
-            echo "There are uncommitted new files, run pre-commit locally and commit again"
+            echo "::error::There are new untracked files after pre-commit. Please run 'pre-commit run --all-files' locally and commit the fixes."
+            echo "::warning::New files:"
             echo "$unstaged_files"
             exit 1
           fi

View file

@@ -34,13 +34,12 @@ This data enables data-driven architectural decisions and performance optimization
 **1. Deploy base k8s infrastructure:**
 ```bash
-cd ../k8s
+cd ../../docs/source/distributions/k8s
 ./apply.sh
 ```

 **2. Deploy benchmark components:**
 ```bash
-cd ../k8s-benchmark
 ./apply.sh
 ```
@@ -56,7 +55,6 @@ kubectl get pods
 **Benchmark Llama Stack (default):**
 ```bash
-cd docs/source/distributions/k8s-benchmark/
 ./run-benchmark.sh
 ```

View file

@@ -14,7 +14,7 @@ import os
 import random
 import statistics
 import time
-from typing import Tuple

 import aiohttp

@@ -57,17 +57,9 @@ class BenchmarkStats:
         success_rate = (self.success_count / self.total_requests) * 100

         print(f"\n{'=' * 60}")
-        print(f"BENCHMARK RESULTS")
-        print(f"{'='*60}")
-        print(f"Total time: {total_time:.2f}s")
-        print(f"Concurrent users: {self.concurrent_users}")
-        print(f"Total requests: {self.total_requests}")
-        print(f"Successful requests: {self.success_count}")
-        print(f"Failed requests: {len(self.errors)}")
-        print(f"Success rate: {success_rate:.1f}%")
-        print(f"Requests per second: {self.success_count / total_time:.2f}")
+        print("BENCHMARK RESULTS")

-        print(f"\nResponse Time Statistics:")
+        print("\nResponse Time Statistics:")
         print(f"  Mean: {statistics.mean(self.response_times):.3f}s")
         print(f"  Median: {statistics.median(self.response_times):.3f}s")
         print(f"  Min: {min(self.response_times):.3f}s")
@@ -78,14 +70,14 @@ class BenchmarkStats:
         percentiles = [50, 90, 95, 99]
         sorted_times = sorted(self.response_times)
-        print(f"\nPercentiles:")
+        print("\nPercentiles:")
         for p in percentiles:
             idx = int(len(sorted_times) * p / 100) - 1
             idx = max(0, min(idx, len(sorted_times) - 1))
             print(f"  P{p}: {sorted_times[idx]:.3f}s")

         if self.ttft_times:
-            print(f"\nTime to First Token (TTFT) Statistics:")
+            print("\nTime to First Token (TTFT) Statistics:")
             print(f"  Mean: {statistics.mean(self.ttft_times):.3f}s")
             print(f"  Median: {statistics.median(self.ttft_times):.3f}s")
             print(f"  Min: {min(self.ttft_times):.3f}s")
@@ -95,26 +87,35 @@ class BenchmarkStats:
             print(f"  Std Dev: {statistics.stdev(self.ttft_times):.3f}s")

             sorted_ttft = sorted(self.ttft_times)
-            print(f"\nTTFT Percentiles:")
+            print("\nTTFT Percentiles:")
             for p in percentiles:
                 idx = int(len(sorted_ttft) * p / 100) - 1
                 idx = max(0, min(idx, len(sorted_ttft) - 1))
                 print(f"  P{p}: {sorted_ttft[idx]:.3f}s")

         if self.chunks_received:
-            print(f"\nStreaming Statistics:")
+            print("\nStreaming Statistics:")
             print(f"  Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
             print(f"  Total chunks received: {sum(self.chunks_received)}")

+        print(f"{'=' * 60}")
+        print(f"Total time: {total_time:.2f}s")
+        print(f"Concurrent users: {self.concurrent_users}")
+        print(f"Total requests: {self.total_requests}")
+        print(f"Successful requests: {self.success_count}")
+        print(f"Failed requests: {len(self.errors)}")
+        print(f"Success rate: {success_rate:.1f}%")
+        print(f"Requests per second: {self.success_count / total_time:.2f}")
+
         if self.errors:
-            print(f"\nErrors (showing first 5):")
+            print("\nErrors (showing first 5):")
             for error in self.errors[:5]:
                 print(f"  {error}")


 class LlamaStackBenchmark:
     def __init__(self, base_url: str, model_id: str):
-        self.base_url = base_url.rstrip('/')
+        self.base_url = base_url.rstrip("/")
         self.model_id = model_id
         self.headers = {"Content-Type": "application/json"}
         self.test_messages = [
@@ -125,20 +126,14 @@ class LlamaStackBenchmark:
             [
                 {"role": "user", "content": "What is machine learning?"},
                 {"role": "assistant", "content": "Machine learning is a subset of AI..."},
-                {"role": "user", "content": "Can you give me a practical example?"}
-            ]
+                {"role": "user", "content": "Can you give me a practical example?"},
+            ],
         ]

-    async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
+    async def make_async_streaming_request(self) -> tuple[float, int, float | None, str | None]:
         """Make a single async streaming chat completion request."""
         messages = random.choice(self.test_messages)
-        payload = {
-            "model": self.model_id,
-            "messages": messages,
-            "stream": True,
-            "max_tokens": 100
-        }
+        payload = {"model": self.model_id, "messages": messages, "stream": True, "max_tokens": 100}

         start_time = time.time()
         chunks_received = 0
@@ -152,17 +147,17 @@ class LlamaStackBenchmark:
                 f"{self.base_url}/chat/completions",
                 headers=self.headers,
                 json=payload,
-                timeout=aiohttp.ClientTimeout(total=30)
+                timeout=aiohttp.ClientTimeout(total=30),
             ) as response:
                 if response.status == 200:
                     async for line in response.content:
                         if line:
-                            line_str = line.decode('utf-8').strip()
-                            if line_str.startswith('data: '):
+                            line_str = line.decode("utf-8").strip()
+                            if line_str.startswith("data: "):
                                 chunks_received += 1
                                 if ttft is None:
                                     ttft = time.time() - start_time
-                                if line_str == 'data: [DONE]':
+                                if line_str == "data: [DONE]":
                                     break

                     if chunks_received == 0:
@@ -179,7 +174,6 @@ class LlamaStackBenchmark:
         response_time = time.time() - start_time
         return response_time, chunks_received, ttft, error

     async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
         """Run benchmark using async requests for specified duration."""
         stats = BenchmarkStats()
@@ -191,7 +185,7 @@ class LlamaStackBenchmark:
         print(f"Model: {self.model_id}")

         connector = aiohttp.TCPConnector(limit=concurrent_users)
-        async with aiohttp.ClientSession(connector=connector) as session:
+        async with aiohttp.ClientSession(connector=connector):

             async def worker(worker_id: int):
                 """Worker that sends requests sequentially until canceled."""
@@ -215,7 +209,9 @@ class LlamaStackBenchmark:
                     await asyncio.sleep(1)  # Report every second
                     if time.time() >= last_report_time + 10:  # Report every 10 seconds
                         elapsed = time.time() - stats.start_time
-                        print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
+                        print(
+                            f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}"
+                        )
                         last_report_time = time.time()
                 except asyncio.CancelledError:
                     break
@@ -240,14 +236,16 @@ class LlamaStackBenchmark:
 def main():
     parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
-    parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
-                        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
-    parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
-                        help="Model ID to use for requests")
-    parser.add_argument("--duration", type=int, default=60,
-                        help="Duration in seconds to run benchmark (default: 60)")
-    parser.add_argument("--concurrent", type=int, default=10,
-                        help="Number of concurrent users (default: 10)")
+    parser.add_argument(
+        "--base-url",
+        default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
+        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)",
+    )
+    parser.add_argument(
+        "--model", default=os.getenv("INFERENCE_MODEL", "test-model"), help="Model ID to use for requests"
+    )
+    parser.add_argument("--duration", type=int, default=60, help="Duration in seconds to run benchmark (default: 60)")
+    parser.add_argument("--concurrent", type=int, default=10, help="Number of concurrent users (default: 10)")

     args = parser.parse_args()

View file

@@ -11,16 +11,18 @@ OpenAI-compatible mock server that returns:
 - Valid OpenAI-formatted chat completion responses with dynamic content
 """

-from flask import Flask, request, jsonify, Response
-import time
-import random
-import uuid
-import json
 import argparse
+import json
 import os
+import random
+import time
+import uuid
+
+from flask import Flask, Response, jsonify, request

 app = Flask(__name__)


 # Models from environment variables
 def get_models():
     models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
@@ -29,40 +31,72 @@ def get_models():
     return {
         "object": "list",
         "data": [
-            {
-                "id": model_id,
-                "object": "model",
-                "created": 1234567890,
-                "owned_by": "vllm"
-            }
-            for model_id in model_ids
-        ]
+            {"id": model_id, "object": "model", "created": 1234567890, "owned_by": "vllm"} for model_id in model_ids
+        ],
     }


 def generate_random_text(length=50):
     """Generate random but coherent text for responses."""
     words = [
-        "Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
-        "with", "your", "questions", "and", "tasks", "today", "Let", "me","know", "what",
-        "you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
-        "with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
+        "Hello",
+        "there",
+        "I'm",
+        "an",
+        "AI",
+        "assistant",
+        "ready",
+        "to",
+        "help",
+        "you",
+        "with",
+        "your",
+        "questions",
+        "and",
+        "tasks",
+        "today",
+        "Let",
+        "me",
+        "know",
+        "what",
+        "you'd",
+        "like",
+        "to",
+        "discuss",
+        "or",
+        "explore",
+        "together",
+        "I",
+        "can",
+        "assist",
+        "with",
+        "various",
+        "topics",
+        "including",
+        "coding",
+        "writing",
+        "analysis",
+        "and",
+        "more",
     ]
     return " ".join(random.choices(words, k=length))


-@app.route('/v1/models', methods=['GET'])
+@app.route("/v1/models", methods=["GET"])
 def list_models():
     models = get_models()
     print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
     return jsonify(models)


-@app.route('/v1/chat/completions', methods=['POST'])
+@app.route("/v1/chat/completions", methods=["POST"])
 def chat_completions():
     """Return OpenAI-formatted chat completion responses."""
     data = request.get_json()
-    default_model = get_models()['data'][0]['id']
-    model = data.get('model', default_model)
-    messages = data.get('messages', [])
-    stream = data.get('stream', False)
+    default_model = get_models()["data"][0]["id"]
+    model = data.get("model", default_model)
+    messages = data.get("messages", [])
+    stream = data.get("stream", False)

     print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")

@@ -71,11 +105,12 @@ def chat_completions():
     else:
         return handle_non_streaming_completion(model, messages)

+
 def handle_non_streaming_completion(model, messages):
     response_text = generate_random_text(random.randint(20, 80))

     # Calculate realistic token counts
-    prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
+    prompt_tokens = sum(len(str(msg.get("content", "")).split()) for msg in messages)
     completion_tokens = len(response_text.split())

     response = {
@@ -83,25 +118,17 @@ def handle_non_streaming_completion(model, messages):
         "object": "chat.completion",
         "created": int(time.time()),
         "model": model,
-        "choices": [
-            {
-                "index": 0,
-                "message": {
-                    "role": "assistant",
-                    "content": response_text
-                },
-                "finish_reason": "stop"
-            }
-        ],
+        "choices": [{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}],
         "usage": {
             "prompt_tokens": prompt_tokens,
             "completion_tokens": completion_tokens,
-            "total_tokens": prompt_tokens + completion_tokens
-        }
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
     }

     return jsonify(response)

+
 def handle_streaming_completion(model, messages):
     def generate_stream():
         # Generate response text
@@ -114,12 +141,7 @@ def handle_streaming_completion(model, messages):
             "object": "chat.completion.chunk",
             "created": int(time.time()),
             "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"role": "assistant", "content": ""}
-                }
-            ]
+            "choices": [{"index": 0, "delta": {"role": "assistant", "content": ""}}],
         }
         yield f"data: {json.dumps(initial_chunk)}\n\n"

@@ -130,12 +152,7 @@ def handle_streaming_completion(model, messages):
                 "object": "chat.completion.chunk",
                 "created": int(time.time()),
                 "model": model,
-                "choices": [
-                    {
-                        "index": 0,
-                        "delta": {"content": f"{word} " if i < len(words) - 1 else word}
-                    }
-                ]
+                "choices": [{"index": 0, "delta": {"content": f"{word} " if i < len(words) - 1 else word}}],
             }
             yield f"data: {json.dumps(chunk)}\n\n"
             # Configurable delay to simulate realistic streaming
@@ -148,35 +165,30 @@ def handle_streaming_completion(model, messages):
             "object": "chat.completion.chunk",
             "created": int(time.time()),
             "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"content": ""},
-                    "finish_reason": "stop"
-                }
-            ]
+            "choices": [{"index": 0, "delta": {"content": ""}, "finish_reason": "stop"}],
         }
         yield f"data: {json.dumps(final_chunk)}\n\n"
         yield "data: [DONE]\n\n"

     return Response(
         generate_stream(),
-        mimetype='text/event-stream',
+        mimetype="text/event-stream",
         headers={
-            'Cache-Control': 'no-cache',
-            'Connection': 'keep-alive',
-            'Access-Control-Allow-Origin': '*',
-        }
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Access-Control-Allow-Origin": "*",
+        },
     )

-@app.route('/health', methods=['GET'])
+
+@app.route("/health", methods=["GET"])
 def health():
     return jsonify({"status": "healthy", "type": "openai-mock"})

-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
-    parser.add_argument('--port', type=int, default=8081,
-                        help='Port to run the server on (default: 8081)')
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="OpenAI-compatible mock server")
+    parser.add_argument("--port", type=int, default=8081, help="Port to run the server on (default: 8081)")
     args = parser.parse_args()
     port = args.port
@@ -187,4 +199,4 @@ if __name__ == '__main__':
     print("- OpenAI-formatted chat/completion responses with dynamic content")
     print("- Streaming support with valid SSE format")
     print(f"- Listening on: http://0.0.0.0:{port}")
-    app.run(host='0.0.0.0', port=port, debug=False)
+    app.run(host="0.0.0.0", port=port, debug=False)

View file

@@ -2,6 +2,7 @@ version: '2'
 image_name: kubernetes-benchmark-demo
 apis:
 - agents
+- files
 - inference
-- files
 - safety
@@ -20,6 +21,14 @@ providers:
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
   vector_io:
   - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
     provider_type: remote::chromadb

View file

@@ -1,5 +1,106 @@
@import url("theme.css");
/* Horizontal Navigation Bar */
.horizontal-nav {
background-color: #ffffff;
border-bottom: 1px solid #e5e5e5;
padding: 0;
position: fixed;
top: 0;
left: 0;
right: 0;
z-index: 1050;
height: 50px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
[data-theme="dark"] .horizontal-nav {
background-color: #1a1a1a;
border-bottom: 1px solid #333;
}
.horizontal-nav .nav-container {
max-width: 1200px;
margin: 0 auto;
display: flex;
align-items: center;
justify-content: space-between;
padding: 0 20px;
height: 100%;
}
.horizontal-nav .nav-brand {
font-size: 18px;
font-weight: 600;
color: #333;
text-decoration: none;
}
[data-theme="dark"] .horizontal-nav .nav-brand {
color: #fff;
}
.horizontal-nav .nav-links {
display: flex;
align-items: center;
gap: 30px;
list-style: none;
margin: 0;
padding: 0;
}
.horizontal-nav .nav-links a {
color: #666;
text-decoration: none;
font-size: 14px;
font-weight: 500;
padding: 8px 12px;
border-radius: 6px;
transition: all 0.2s ease;
}
.horizontal-nav .nav-links a:hover,
.horizontal-nav .nav-links a.active {
color: #333;
background-color: #f5f5f5;
}
.horizontal-nav .nav-links a.active {
font-weight: 600;
}
[data-theme="dark"] .horizontal-nav .nav-links a {
color: #ccc;
}
[data-theme="dark"] .horizontal-nav .nav-links a:hover,
[data-theme="dark"] .horizontal-nav .nav-links a.active {
color: #fff;
background-color: #333;
}
.horizontal-nav .nav-links .github-link {
display: flex;
align-items: center;
gap: 6px;
}
.horizontal-nav .nav-links .github-icon {
width: 16px;
height: 16px;
fill: currentColor;
}
/* Adjust main content to account for fixed nav */
.wy-nav-side {
top: 50px;
height: calc(100vh - 50px);
}
.wy-nav-content-wrap {
margin-top: 50px;
}
.wy-nav-content {
  max-width: 90%;
}

docs/_static/js/horizontal_nav.js (new file, 44 lines)
View file

@@ -0,0 +1,44 @@
// Horizontal Navigation Bar for Llama Stack Documentation
document.addEventListener('DOMContentLoaded', function() {
// Create the horizontal navigation HTML
const navHTML = `
<nav class="horizontal-nav">
<div class="nav-container">
<a href="/" class="nav-brand">Llama Stack</a>
<ul class="nav-links">
<li><a href="/">Docs</a></li>
<li><a href="/references/api_reference/">API Reference</a></li>
<li><a href="https://github.com/meta-llama/llama-stack" target="_blank" class="github-link">
<svg class="github-icon" viewBox="0 0 16 16" aria-hidden="true">
<path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/>
</svg>
GitHub
</a></li>
</ul>
</div>
</nav>
`;
// Insert the navigation at the beginning of the body
document.body.insertAdjacentHTML('afterbegin', navHTML);
// Update navigation links based on current page
updateActiveNav();
});
function updateActiveNav() {
const currentPath = window.location.pathname;
const navLinks = document.querySelectorAll('.horizontal-nav .nav-links a');
navLinks.forEach(link => {
// Remove any existing active classes
link.classList.remove('active');
// Add active class based on current path
if (currentPath === '/' && link.getAttribute('href') === '/') {
link.classList.add('active');
} else if (currentPath.includes('/references/api_reference/') && link.getAttribute('href').includes('api_reference')) {
link.classList.add('active');
}
});
}

View file

@@ -0,0 +1,701 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1ztegmwm4sp",
"metadata": {},
"source": [
"## LlamaStack + LangChain Integration Tutorial\n",
"\n",
"This notebook demonstrates how to integrate **LlamaStack** with **LangChain** to build a complete RAG (Retrieval-Augmented Generation) system.\n",
"\n",
"### Overview\n",
"\n",
"- **LlamaStack**: Provides the infrastructure for running LLMs and Open AI Compatible Vector Stores\n",
"- **LangChain**: Provides the framework for chaining operations and prompt templates\n",
"- **Integration**: Uses LlamaStack's OpenAI-compatible API with LangChain\n",
"\n",
"### What You'll See\n",
"\n",
"1. Setting up LlamaStack server with Fireworks AI provider\n",
"2. Creating and Querying Vector Stores\n",
"3. Building RAG chains with LangChain + LLAMAStack\n",
"4. Querying the chain for relevant information\n",
"\n",
"### Prerequisites\n",
"\n",
"- Fireworks API key\n",
"\n",
"---\n",
"\n",
"### 1. Installation and Setup"
]
},
{
"cell_type": "markdown",
"id": "2ktr5ls2cas",
"metadata": {},
"source": [
"#### Install Required Dependencies\n",
"\n",
"First, we install all the necessary packages for LangChain and FastAPI integration."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "5b6a6a17-b931-4bea-8273-0d6e5563637a",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: uv in /Users/swapna942/miniconda3/lib/python3.12/site-packages (0.7.20)\n",
"\u001b[2mUsing Python 3.12.11 environment at: /Users/swapna942/miniconda3\u001b[0m\n",
"\u001b[2mAudited \u001b[1m7 packages\u001b[0m \u001b[2min 42ms\u001b[0m\u001b[0m\n"
]
}
],
"source": [
"!pip install uv\n",
"!uv pip install fastapi uvicorn \"langchain>=0.2\" langchain-openai \\\n",
" langchain-community langchain-text-splitters \\\n",
" faiss-cpu"
]
},
{
"cell_type": "markdown",
"id": "wmt9jvqzh7n",
"metadata": {},
"source": [
"### 2. LlamaStack Server Setup\n",
"\n",
"#### Build and Start LlamaStack Server\n",
"\n",
"This section sets up the LlamaStack server with:\n",
"- **Fireworks AI** as the inference provider\n",
"- **Sentence Transformers** for embeddings\n",
"\n",
"The server runs on `localhost:8321` and provides OpenAI-compatible endpoints."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "dd2dacf3-ec8b-4cc7-8ff4-b5b6ea4a6e9e",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import os\n",
"import subprocess\n",
"import time\n",
"\n",
"# Remove UV_SYSTEM_PYTHON to ensure uv creates a proper virtual environment\n",
"# instead of trying to use system Python globally, which could cause permission issues\n",
"# and package conflicts with the system's Python installation\n",
"if \"UV_SYSTEM_PYTHON\" in os.environ:\n",
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
"\n",
"def run_llama_stack_server_background():\n",
" \"\"\"Build and run LlamaStack server in one step using --run flag\"\"\"\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" \"uv run --with llama-stack llama stack build --distro starter --image-type venv --run\",\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",
" text=True,\n",
" )\n",
"\n",
" print(f\"Building and starting Llama Stack server with PID: {process.pid}\")\n",
" return process\n",
"\n",
"\n",
"def wait_for_server_to_start():\n",
" import requests\n",
" from requests.exceptions import ConnectionError\n",
"\n",
" url = \"http://0.0.0.0:8321/v1/health\"\n",
" max_retries = 30\n",
" retry_interval = 1\n",
"\n",
" print(\"Waiting for server to start\", end=\"\")\n",
" for _ in range(max_retries):\n",
" try:\n",
" response = requests.get(url)\n",
" if response.status_code == 200:\n",
" print(\"\\nServer is ready!\")\n",
" return True\n",
" except ConnectionError:\n",
" print(\".\", end=\"\", flush=True)\n",
" time.sleep(retry_interval)\n",
"\n",
" print(\"\\nServer failed to start after\", max_retries * retry_interval, \"seconds\")\n",
" return False\n",
"\n",
"\n",
"def kill_llama_stack_server():\n",
" # Kill any existing llama stack server processes using pkill command\n",
" os.system(\"pkill -f llama_stack.core.server.server\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "28bd8dbd-4576-4e76-813f-21ab94db44a2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Building and starting Llama Stack server with PID: 19747\n",
"Waiting for server to start....\n",
"Server is ready!\n"
]
}
],
"source": [
"server_process = run_llama_stack_server_background()\n",
"assert wait_for_server_to_start()"
]
},
{
"cell_type": "markdown",
"id": "gr9cdcg4r7n",
"metadata": {},
"source": [
"#### Install LlamaStack Client\n",
"\n",
"Install the client library to interact with the LlamaStack server."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "487d2dbc-d071-400e-b4f0-dcee58f8dc95",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2mUsing Python 3.12.11 environment at: /Users/swapna942/miniconda3\u001b[0m\n",
"\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 27ms\u001b[0m\u001b[0m\n"
]
}
],
"source": [
"!uv pip install llama_stack_client"
]
},
{
"cell_type": "markdown",
"id": "0j5hag7l9x89",
"metadata": {},
"source": [
"### 3. Initialize LlamaStack Client\n",
"\n",
"Create a client connection to the LlamaStack server with API keys for different providers:\n",
"\n",
"- **Fireworks API Key**: For Fireworks models\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ab4eff97-4565-4c73-b1b3-0020a4c7e2a5",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client import LlamaStackClient\n",
"\n",
"client = LlamaStackClient(\n",
" base_url=\"http://0.0.0.0:8321\",\n",
" provider_data={\"fireworks_api_key\": \"***\"},\n",
")"
]
},
{
"cell_type": "markdown",
"id": "vwhexjy1e8o",
"metadata": {},
"source": [
"#### Explore Available Models and Safety Features\n",
"\n",
"Check what models and safety shields are available through your LlamaStack instance."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "880443ef-ac3c-48b1-a80a-7dab5b25ac61",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: GET http://0.0.0.0:8321/v1/models \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: GET http://0.0.0.0:8321/v1/shields \"HTTP/1.1 200 OK\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Available Fireworks models:\n",
"- fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct\n",
"- fireworks/accounts/fireworks/models/llama-v3p1-70b-instruct\n",
"- fireworks/accounts/fireworks/models/llama-v3p1-405b-instruct\n",
"- fireworks/accounts/fireworks/models/llama-v3p2-3b-instruct\n",
"- fireworks/accounts/fireworks/models/llama-v3p2-11b-vision-instruct\n",
"- fireworks/accounts/fireworks/models/llama-v3p2-90b-vision-instruct\n",
"- fireworks/accounts/fireworks/models/llama-v3p3-70b-instruct\n",
"- fireworks/accounts/fireworks/models/llama4-scout-instruct-basic\n",
"- fireworks/accounts/fireworks/models/llama4-maverick-instruct-basic\n",
"- fireworks/nomic-ai/nomic-embed-text-v1.5\n",
"- fireworks/accounts/fireworks/models/llama-guard-3-8b\n",
"- fireworks/accounts/fireworks/models/llama-guard-3-11b-vision\n",
"----\n",
"Available shields (safety models):\n",
"code-scanner\n",
"llama-guard\n",
"nemo-guardrail\n",
"----\n"
]
}
],
"source": [
"print(\"Available Fireworks models:\")\n",
"for m in client.models.list():\n",
" if m.identifier.startswith(\"fireworks/\"):\n",
" print(f\"- {m.identifier}\")\n",
"\n",
"print(\"----\")\n",
"print(\"Available shields (safety models):\")\n",
"for s in client.shields.list():\n",
" print(s.identifier)\n",
"print(\"----\")"
]
},
{
"cell_type": "markdown",
"id": "gojp7at31ht",
"metadata": {},
"source": [
"### 4. Vector Store Setup\n",
"\n",
"#### Create a Vector Store with File Upload\n",
"\n",
"Create a vector store using the OpenAI-compatible vector stores API:\n",
"\n",
"- **Vector Store**: OpenAI-compatible vector store for document storage\n",
"- **File Upload**: Automatic chunking and embedding of uploaded files \n",
"- **Embedding Model**: Sentence Transformers model for text embeddings\n",
"- **Dimensions**: 384-dimensional embeddings"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "be2c2899-ea53-4e5f-b6b8-ed425f5d6572",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/files \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/files \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/files \"HTTP/1.1 200 OK\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File(id='file-54652c95c56c4c34918a97d7ff8a4320', bytes=41, created_at=1757442621, expires_at=1788978621, filename='shipping_policy.txt', object='file', purpose='assistants')\n",
"File(id='file-fb1227c1d1854da1bd774d21e5b7e41c', bytes=48, created_at=1757442621, expires_at=1788978621, filename='returns_policy.txt', object='file', purpose='assistants')\n",
"File(id='file-673f874852fe42798675a13d06a256e2', bytes=45, created_at=1757442621, expires_at=1788978621, filename='support.txt', object='file', purpose='assistants')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores \"HTTP/1.1 200 OK\"\n"
]
}
],
"source": [
"from io import BytesIO\n",
"\n",
"docs = [\n",
" (\"Acme ships globally in 3-5 business days.\", {\"title\": \"Shipping Policy\"}),\n",
" (\"Returns are accepted within 30 days of purchase.\", {\"title\": \"Returns Policy\"}),\n",
" (\"Support is available 24/7 via chat and email.\", {\"title\": \"Support\"}),\n",
"]\n",
"\n",
"file_ids = []\n",
"for content, metadata in docs:\n",
" with BytesIO(content.encode()) as file_buffer:\n",
" file_buffer.name = f\"{metadata['title'].replace(' ', '_').lower()}.txt\"\n",
" create_file_response = client.files.create(file=file_buffer, purpose=\"assistants\")\n",
" print(create_file_response)\n",
" file_ids.append(create_file_response.id)\n",
"\n",
"# Create vector store with files\n",
"vector_store = client.vector_stores.create(\n",
" name=\"acme_docs\",\n",
" file_ids=file_ids,\n",
" embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\",\n",
" embedding_dimension=384,\n",
" provider_id=\"faiss\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "9061tmi1zpq",
"metadata": {},
"source": [
"#### Test Vector Store Search\n",
"\n",
"Query the vector store. This performs semantic search to find relevant documents based on the query."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ba9d1901-bd5e-4216-b3e6-19dc74551cc6",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores/vs_708c060b-45da-423e-8354-68529b4fd1a6/search \"HTTP/1.1 200 OK\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Acme ships globally in 3-5 business days.\n",
"Returns are accepted within 30 days of purchase.\n"
]
}
],
"source": [
"search_response = client.vector_stores.search(\n",
" vector_store_id=vector_store.id,\n",
" query=\"How long does shipping take?\",\n",
" max_num_results=2\n",
")\n",
"for result in search_response.data:\n",
" content = result.content[0].text\n",
" print(content)"
]
},
{
"cell_type": "markdown",
"id": "usne6mbspms",
"metadata": {},
"source": [
"### 5. LangChain Integration\n",
"\n",
"#### Configure LangChain with LlamaStack\n",
"\n",
"Set up LangChain to use LlamaStack's OpenAI-compatible API:\n",
"\n",
"- **Base URL**: Points to LlamaStack's OpenAI endpoint\n",
"- **Headers**: Include Fireworks API key for model access\n",
"- **Model**: Use Meta Llama v3p1 8b instruct model for inference"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c378bd10-09c2-417c-bdfc-1e0a2dd19084",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"# Point LangChain to Llamastack Server\n",
"llm = ChatOpenAI(\n",
" base_url=\"http://0.0.0.0:8321/v1/openai/v1\",\n",
" api_key=\"dummy\",\n",
" model=\"fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct\",\n",
" default_headers={\"X-LlamaStack-Provider-Data\": '{\"fireworks_api_key\": \"***\"}'},\n",
")"
]
},
{
"cell_type": "markdown",
"id": "5a4ddpcuk3l",
"metadata": {},
"source": [
"#### Test LLM Connection\n",
"\n",
"Verify that LangChain can successfully communicate with the LlamaStack server."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "f88ffb5a-657b-4916-9375-c6ddc156c25e",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
]
},
{
"data": {
"text/plain": [
"AIMessage(content=\"A llama's gentle eyes shine bright,\\nIn the Andes, it roams through morning light.\", additional_kwargs={'refusal': None}, response_metadata={'token_usage': None, 'model_name': 'fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct', 'system_fingerprint': None, 'id': 'chatcmpl-602b5967-82a3-476b-9cd2-7d3b29b76ee8', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None}, id='run--0933c465-ff4d-4a7b-b7fb-fd97dd8244f3-0')"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Test llm with simple message\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n",
"]\n",
"llm.invoke(messages)"
]
},
{
"cell_type": "markdown",
"id": "0xh0jg6a0l4a",
"metadata": {},
"source": [
"### 6. Building the RAG Chain\n",
"\n",
"#### Create a Complete RAG Pipeline\n",
"\n",
"Build a LangChain pipeline that combines:\n",
"\n",
"1. **Vector Search**: Query LlamaStack's Open AI compatible Vector Store\n",
"2. **Context Assembly**: Format retrieved documents\n",
"3. **Prompt Template**: Structure the input for the LLM\n",
"4. **LLM Generation**: Generate answers using context\n",
"5. **Output Parsing**: Extract the final response\n",
"\n",
"**Chain Flow**: `Query → Vector Search → Context + Question → LLM → Response`"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9684427d-dcc7-4544-9af5-8b110d014c42",
"metadata": {},
"outputs": [],
"source": [
"# LangChain for prompt template and chaining + LLAMA Stack Client Vector DB and LLM chat completion\n",
"from langchain_core.output_parsers import StrOutputParser\n",
"from langchain_core.prompts import ChatPromptTemplate\n",
"from langchain_core.runnables import RunnableLambda, RunnablePassthrough\n",
"\n",
"\n",
"def join_docs(docs):\n",
" return \"\\n\\n\".join([f\"[{d.filename}] {d.content[0].text}\" for d in docs.data])\n",
"\n",
"PROMPT = ChatPromptTemplate.from_messages(\n",
" [\n",
" (\"system\", \"You are a helpful assistant. Use the following context to answer.\"),\n",
" (\"user\", \"Question: {question}\\n\\nContext:\\n{context}\"),\n",
" ]\n",
")\n",
"\n",
"vector_step = RunnableLambda(\n",
" lambda x: client.vector_stores.search(\n",
" vector_store_id=vector_store.id,\n",
" query=x,\n",
" max_num_results=2\n",
" )\n",
" )\n",
"\n",
"chain = (\n",
" {\"context\": vector_step | RunnableLambda(join_docs), \"question\": RunnablePassthrough()}\n",
" | PROMPT\n",
" | llm\n",
" | StrOutputParser()\n",
")"
]
},
{
"cell_type": "markdown",
"id": "0onu6rhphlra",
"metadata": {},
"source": [
"### 7. Testing the RAG System\n",
"\n",
"#### Example 1: Shipping Query"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "03322188-9509-446a-a4a8-ce3bb83ec87c",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores/vs_708c060b-45da-423e-8354-68529b4fd1a6/search \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"❓ How long does shipping take?\n",
"💡 Acme ships globally in 3-5 business days. This means that shipping typically takes between 3 to 5 working days from the date of dispatch or order fulfillment.\n"
]
}
],
"source": [
"query = \"How long does shipping take?\"\n",
"response = chain.invoke(query)\n",
"print(\"❓\", query)\n",
"print(\"💡\", response)"
]
},
{
"cell_type": "markdown",
"id": "b7krhqj88ku",
"metadata": {},
"source": [
"#### Example 2: Returns Policy Query"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "61995550-bb0b-46a8-a5d0-023207475d60",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores/vs_708c060b-45da-423e-8354-68529b4fd1a6/search \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"❓ Can I return a product after 40 days?\n",
"💡 Based on the provided context, you cannot return a product after 40 days. The return window is limited to 30 days from the date of purchase.\n"
]
}
],
"source": [
"query = \"Can I return a product after 40 days?\"\n",
"response = chain.invoke(query)\n",
"print(\"❓\", query)\n",
"print(\"💡\", response)"
]
},
{
"cell_type": "markdown",
"id": "h4w24fadvjs",
"metadata": {},
"source": [
"---\n",
"We have successfully built a RAG system that combines:\n",
"\n",
"- **LlamaStack** for infrastructure (LLM serving + Vector Store)\n",
"- **LangChain** for orchestration (prompts + chains)\n",
"- **Fireworks** for high-quality language models\n",
"\n",
"### Key Benefits\n",
"\n",
"1. **Unified Infrastructure**: Single server for LLMs and Vector Store\n",
"2. **OpenAI Compatibility**: Easy integration with existing LangChain code\n",
"3. **Multi-Provider Support**: Switch between different LLM providers\n",
"4. **Production Ready**: Built-in safety shields and monitoring\n",
"\n",
"### Next Steps\n",
"\n",
"- Add more sophisticated document processing\n",
"- Implement conversation memory\n",
"- Add safety filtering and monitoring\n",
"- Scale to larger document collections\n",
"- Integrate with web frameworks like FastAPI or Streamlit\n",
"\n",
"---\n",
"\n",
"##### 🔧 Cleanup\n",
"\n",
"Don't forget to stop the LlamaStack server when you're done:\n",
"\n",
"```python\n",
"kill_llama_stack_server()\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "15647c46-22ce-4698-af3f-8161329d8e3a",
"metadata": {},
"outputs": [],
"source": [
"kill_llama_stack_server()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@@ -93,10 +93,31 @@ chunks_response = client.vector_io.query(
 ### Using the RAG Tool

+> **⚠️ DEPRECATION NOTICE**: The RAG Tool is being deprecated in favor of directly using the OpenAI-compatible Search
+> API. We recommend migrating to the OpenAI APIs for better compatibility and future support.
+
 A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc.
 and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the
 [appendix](#more-ragdocument-examples).

+#### OpenAI API Integration & Migration
+
+The RAG tool has been updated to use OpenAI-compatible APIs. This provides several benefits:
+
+- **Files API Integration**: Documents are now uploaded using OpenAI's file upload endpoints
+- **Vector Stores API**: Vector storage operations use OpenAI's vector store format with configurable chunking strategies
+- **Error Resilience**: When processing multiple documents, individual failures are logged but don't crash the operation. Failed documents are skipped while successful ones continue processing.
+
+**Migration Path:**
+We recommend migrating to the OpenAI-compatible Search API for:
+1. **Better OpenAI Ecosystem Integration**: Direct compatibility with OpenAI tools and workflows including the Responses API
+2. **Future-Proof**: Continued support and feature development
+3. **Full OpenAI Compatibility**: Vector Stores, Files, and Search APIs are fully compatible with OpenAI's Responses API
+
+The OpenAI APIs are used under the hood, so you can continue to use your existing RAG Tool code with minimal changes.
+However, we recommend updating your code to use the new OpenAI-compatible APIs for better long-term support. If any
+documents fail to process, they will be logged in the response but will not cause the entire operation to fail.
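For illustration, a minimal sketch of this OpenAI-compatible path, patterned on the Files and Vector Stores calls used in the LangChain notebook added in this same commit; the document contents, store name, embedding settings, and query below are placeholders:

```python
# Sketch of the OpenAI-compatible ingestion path (Files + Vector Stores APIs),
# mirroring the notebook elsewhere in this commit. Values are illustrative only.
from io import BytesIO

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Upload a document through the Files API.
with BytesIO(b"Acme ships globally in 3-5 business days.") as buf:
    buf.name = "shipping_policy.txt"
    uploaded = client.files.create(file=buf, purpose="assistants")

# Create an OpenAI-compatible vector store from the uploaded file;
# chunking and embedding happen on the server side.
store = client.vector_stores.create(
    name="acme_docs",
    file_ids=[uploaded.id],
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    embedding_dimension=384,
)

# Semantic search against the store.
results = client.vector_stores.search(
    vector_store_id=store.id,
    query="How long does shipping take?",
    max_num_results=2,
)
for hit in results.data:
    print(hit.content[0].text)
```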
 ```python
 from llama_stack_client import RAGDocument

View file

@@ -131,6 +131,7 @@ html_static_path = ["../_static"]
 def setup(app):
     app.add_css_file("css/my_theme.css")
     app.add_js_file("js/detect_theme.js")
+    app.add_js_file("js/horizontal_nav.js")
     app.add_js_file("js/keyboard_shortcuts.js")


 def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):

View file

@@ -35,5 +35,5 @@ testing/record-replay
 ### Benchmarking

-```{include} ../../../docs/source/distributions/k8s-benchmark/README.md
+```{include} ../../../benchmarking/k8s-benchmark/README.md
 ```

View file

@@ -18,6 +18,7 @@ This section contains documentation for all available providers for the **inference** API.
 inline_meta-reference
 inline_sentence-transformers
 remote_anthropic
+remote_azure
 remote_bedrock
 remote_cerebras
 remote_databricks

View file

@@ -0,0 +1,29 @@
# remote::azure
## Description
Azure OpenAI inference provider for accessing GPT models and other Azure services.
Provider documentation
https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `api_key` | `<class 'pydantic.types.SecretStr'>` | No | | Azure API key for Azure |
| `api_base` | `<class 'pydantic.networks.HttpUrl'>` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) |
| `api_version` | `str \| None` | No | | Azure API version for Azure (e.g., 2024-12-01-preview) |
| `api_type` | `str \| None` | No | azure | Azure API type for Azure (e.g., azure) |
## Sample Configuration
```yaml
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
```
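As a usage sketch, the starter distribution wires this provider into its run.yaml roughly as follows, enabling it only when `AZURE_API_KEY` is set; this mirrors the run.yaml changes made elsewhere in this commit:

```yaml
providers:
  inference:
  - provider_id: ${env.AZURE_API_KEY:+azure}
    provider_type: remote::azure
    config:
      api_key: ${env.AZURE_API_KEY:=}
      api_base: ${env.AZURE_API_BASE:=}
      api_version: ${env.AZURE_API_VERSION:=}
      api_type: ${env.AZURE_API_TYPE:=}
```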

View file

@@ -48,15 +48,12 @@ def setup_verify_download_parser(parser: argparse.ArgumentParser) -> None:
     parser.set_defaults(func=partial(run_verify_cmd, parser=parser))


-def calculate_md5(filepath: Path, chunk_size: int = 8192) -> str:
-    # NOTE: MD5 is used here only for download integrity verification,
-    # not for security purposes
-    # TODO: switch to SHA256
-    md5_hash = hashlib.md5(usedforsecurity=False)
+def calculate_sha256(filepath: Path, chunk_size: int = 8192) -> str:
+    sha256_hash = hashlib.sha256()
     with open(filepath, "rb") as f:
         for chunk in iter(lambda: f.read(chunk_size), b""):
-            md5_hash.update(chunk)
-    return md5_hash.hexdigest()
+            sha256_hash.update(chunk)
+    return sha256_hash.hexdigest()


 def load_checksums(checklist_path: Path) -> dict[str, str]:
@@ -64,10 +61,10 @@ def load_checksums(checklist_path: Path) -> dict[str, str]:
     with open(checklist_path) as f:
         for line in f:
             if line.strip():
-                md5sum, filepath = line.strip().split(" ", 1)
+                sha256sum, filepath = line.strip().split(" ", 1)
                 # Remove leading './' if present
                 filepath = filepath.lstrip("./")
-                checksums[filepath] = md5sum
+                checksums[filepath] = sha256sum

     return checksums

@@ -88,7 +85,7 @@ def verify_files(model_dir: Path, checksums: dict[str, str], console: Console) -
         matches = False

         if exists:
-            actual_hash = calculate_md5(full_path)
+            actual_hash = calculate_sha256(full_path)
             matches = actual_hash == expected_hash

         results.append(

View file

@@ -431,6 +431,12 @@ class ServerConfig(BaseModel):
     )


+class InferenceStoreConfig(BaseModel):
+    sql_store_config: SqlStoreConfig
+    max_write_queue_size: int = Field(default=10000, description="Max queued writes for inference store")
+    num_writers: int = Field(default=4, description="Number of concurrent background writers")
+
+
 class StackRunConfig(BaseModel):
     version: int = LLAMA_STACK_RUN_CONFIG_VERSION

@@ -464,11 +470,12 @@ Configuration for the persistence store used by the distribution registry. If not specified,
 a default SQLite store will be used.""",
     )
-    inference_store: SqlStoreConfig | None = Field(
+    inference_store: InferenceStoreConfig | SqlStoreConfig | None = Field(
         default=None,
         description="""
-Configuration for the persistence store used by the inference API. If not specified,
-a default SQLite store will be used.""",
+Configuration for the persistence store used by the inference API. Can be either a
+InferenceStoreConfig (with queue tuning parameters) or a SqlStoreConfig (deprecated).
+If not specified, a default SQLite store will be used.""",
     )

     # registry of "resources" in the distribution
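For reference, a hedged sketch of how the richer `inference_store` block might look in a run.yaml, using the field names introduced above and the SQLite store pattern used elsewhere in this commit; the nested serialization of `sql_store_config` and the db_path are assumptions:

```yaml
# Sketch only: inference_store configured via InferenceStoreConfig rather than a bare SqlStoreConfig.
inference_store:
  sql_store_config:            # assumed to serialize like other sqlite stores in this commit
    type: sqlite
    db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/inference_store.db  # placeholder path
  max_write_queue_size: 10000  # default: max queued writes for the inference store
  num_writers: 4               # default: concurrent background writers
```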

View file

@@ -78,7 +78,10 @@ async def get_auto_router_impl(
     # TODO: move pass configs to routers instead
     if api == Api.inference and run_config.inference_store:
-        inference_store = InferenceStore(run_config.inference_store, policy)
+        inference_store = InferenceStore(
+            config=run_config.inference_store,
+            policy=policy,
+        )
         await inference_store.initialize()
         api_to_dep_impl["store"] = inference_store

View file

@@ -63,7 +63,7 @@ from llama_stack.models.llama.llama3.chat_format import ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
 from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
 from llama_stack.providers.utils.inference.inference_store import InferenceStore
-from llama_stack.providers.utils.telemetry.tracing import get_current_span
+from llama_stack.providers.utils.telemetry.tracing import enqueue_event, get_current_span

 logger = get_logger(name=__name__, category="core::routers")

@@ -90,6 +90,11 @@ class InferenceRouter(Inference):
     async def shutdown(self) -> None:
         logger.debug("InferenceRouter.shutdown")
+        if self.store:
+            try:
+                await self.store.shutdown()
+            except Exception as e:
+                logger.warning(f"Error during InferenceStore shutdown: {e}")

     async def register_model(
         self,
@@ -160,7 +165,7 @@ class InferenceRouter(Inference):
         metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model)
         if self.telemetry:
             for metric in metrics:
-                await self.telemetry.log_event(metric)
+                enqueue_event(metric)
         return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]

     async def _count_tokens(
@@ -431,7 +436,7 @@ class InferenceRouter(Inference):
             model=model_obj,
         )
         for metric in metrics:
-            await self.telemetry.log_event(metric)
+            enqueue_event(metric)

         # these metrics will show up in the client response.
         response.metrics = (
@@ -537,7 +542,7 @@
             model=model_obj,
         )
         for metric in metrics:
-            await self.telemetry.log_event(metric)
+            enqueue_event(metric)

         # these metrics will show up in the client response.
         response.metrics = (
             metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
@@ -664,7 +669,7 @@
                 "completion_tokens",
                 "total_tokens",
             ]:  # Only log completion and total tokens
-                await self.telemetry.log_event(metric)
+                enqueue_event(metric)

         # Return metrics in response
         async_metrics = [
@@ -710,7 +715,7 @@
         )
         for metric in completion_metrics:
             if metric.metric in ["completion_tokens", "total_tokens"]:  # Only log completion and total tokens
-                await self.telemetry.log_event(metric)
+                enqueue_event(metric)

         # Return metrics in response
         return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics]
@@ -806,7 +811,7 @@
                     model=model,
                 )
                 for metric in metrics:
-                    await self.telemetry.log_event(metric)
+                    enqueue_event(metric)

                 yield chunk
             finally:

View file

@ -17,6 +17,7 @@ distribution_spec:
- provider_type: remote::vertexai - provider_type: remote::vertexai
- provider_type: remote::groq - provider_type: remote::groq
- provider_type: remote::sambanova - provider_type: remote::sambanova
- provider_type: remote::azure
- provider_type: inline::sentence-transformers - provider_type: inline::sentence-transformers
vector_io: vector_io:
- provider_type: inline::faiss - provider_type: inline::faiss

View file

@ -81,6 +81,13 @@ providers:
config: config:
url: https://api.sambanova.ai/v1 url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=} api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers - provider_id: sentence-transformers
provider_type: inline::sentence-transformers provider_type: inline::sentence-transformers
vector_io: vector_io:

View file

@ -18,6 +18,7 @@ distribution_spec:
- provider_type: remote::vertexai - provider_type: remote::vertexai
- provider_type: remote::groq - provider_type: remote::groq
- provider_type: remote::sambanova - provider_type: remote::sambanova
- provider_type: remote::azure
- provider_type: inline::sentence-transformers - provider_type: inline::sentence-transformers
vector_io: vector_io:
- provider_type: inline::faiss - provider_type: inline::faiss

View file

@ -81,6 +81,13 @@ providers:
config: config:
url: https://api.sambanova.ai/v1 url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=} api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers - provider_id: sentence-transformers
provider_type: inline::sentence-transformers provider_type: inline::sentence-transformers
vector_io: vector_io:

View file

@ -18,6 +18,7 @@ distribution_spec:
- provider_type: remote::vertexai - provider_type: remote::vertexai
- provider_type: remote::groq - provider_type: remote::groq
- provider_type: remote::sambanova - provider_type: remote::sambanova
- provider_type: remote::azure
- provider_type: inline::sentence-transformers - provider_type: inline::sentence-transformers
vector_io: vector_io:
- provider_type: inline::faiss - provider_type: inline::faiss

View file

@ -81,6 +81,13 @@ providers:
config: config:
url: https://api.sambanova.ai/v1 url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=} api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers - provider_id: sentence-transformers
provider_type: inline::sentence-transformers provider_type: inline::sentence-transformers
vector_io: vector_io:

View file

@ -59,6 +59,7 @@ ENABLED_INFERENCE_PROVIDERS = [
"cerebras", "cerebras",
"nvidia", "nvidia",
"bedrock", "bedrock",
"azure",
] ]
INFERENCE_PROVIDER_IDS = { INFERENCE_PROVIDER_IDS = {
@ -68,6 +69,7 @@ INFERENCE_PROVIDER_IDS = {
"cerebras": "${env.CEREBRAS_API_KEY:+cerebras}", "cerebras": "${env.CEREBRAS_API_KEY:+cerebras}",
"nvidia": "${env.NVIDIA_API_KEY:+nvidia}", "nvidia": "${env.NVIDIA_API_KEY:+nvidia}",
"vertexai": "${env.VERTEX_AI_PROJECT:+vertexai}", "vertexai": "${env.VERTEX_AI_PROJECT:+vertexai}",
"azure": "${env.AZURE_API_KEY:+azure}",
} }
@ -277,5 +279,21 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
"http://localhost:11434", "http://localhost:11434",
"Ollama URL", "Ollama URL",
), ),
"AZURE_API_KEY": (
"",
"Azure API Key",
),
"AZURE_API_BASE": (
"",
"Azure API Base",
),
"AZURE_API_VERSION": (
"",
"Azure API Version",
),
"AZURE_API_TYPE": (
"azure",
"Azure API Type",
),
}, },
) )

View file

@ -8,7 +8,7 @@
from jinja2 import Template from jinja2 import Template
from llama_stack.apis.common.content_types import InterleavedContent from llama_stack.apis.common.content_types import InterleavedContent
from llama_stack.apis.inference import UserMessage from llama_stack.apis.inference import OpenAIUserMessageParam
from llama_stack.apis.tools.rag_tool import ( from llama_stack.apis.tools.rag_tool import (
DefaultRAGQueryGeneratorConfig, DefaultRAGQueryGeneratorConfig,
LLMRAGQueryGeneratorConfig, LLMRAGQueryGeneratorConfig,
@ -61,16 +61,16 @@ async def llm_rag_query_generator(
messages = [interleaved_content_as_str(content)] messages = [interleaved_content_as_str(content)]
template = Template(config.template) template = Template(config.template)
content = template.render({"messages": messages}) rendered_content: str = template.render({"messages": messages})
model = config.model model = config.model
message = UserMessage(content=content) message = OpenAIUserMessageParam(content=rendered_content)
response = await inference_api.chat_completion( response = await inference_api.openai_chat_completion(
model_id=model, model=model,
messages=[message], messages=[message],
stream=False, stream=False,
) )
query = response.completion_message.content query = response.choices[0].message.content
return query return query

View file

@ -45,10 +45,7 @@ from llama_stack.apis.vector_io import (
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
from llama_stack.providers.utils.memory.vector_store import ( from llama_stack.providers.utils.memory.vector_store import parse_data_url
content_from_doc,
parse_data_url,
)
from .config import RagToolRuntimeConfig from .config import RagToolRuntimeConfig
from .context_retriever import generate_rag_query from .context_retriever import generate_rag_query
@ -60,6 +57,47 @@ def make_random_string(length: int = 8):
return "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(length)) return "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(length))
async def raw_data_from_doc(doc: RAGDocument) -> tuple[bytes, str]:
"""Get raw binary data and mime type from a RAGDocument for file upload."""
if isinstance(doc.content, URL):
if doc.content.uri.startswith("data:"):
parts = parse_data_url(doc.content.uri)
mime_type = parts["mimetype"]
data = parts["data"]
if parts["is_base64"]:
file_data = base64.b64decode(data)
else:
file_data = data.encode("utf-8")
return file_data, mime_type
else:
async with httpx.AsyncClient() as client:
r = await client.get(doc.content.uri)
r.raise_for_status()
mime_type = r.headers.get("content-type", "application/octet-stream")
return r.content, mime_type
else:
if isinstance(doc.content, str):
content_str = doc.content
else:
content_str = interleaved_content_as_str(doc.content)
if content_str.startswith("data:"):
parts = parse_data_url(content_str)
mime_type = parts["mimetype"]
data = parts["data"]
if parts["is_base64"]:
file_data = base64.b64decode(data)
else:
file_data = data.encode("utf-8")
return file_data, mime_type
else:
return content_str.encode("utf-8"), "text/plain"
class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime): class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime):
def __init__( def __init__(
self, self,
@ -95,20 +133,12 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
return return
for doc in documents: for doc in documents:
if isinstance(doc.content, URL): try:
if doc.content.uri.startswith("data:"): try:
parts = parse_data_url(doc.content.uri) file_data, mime_type = await raw_data_from_doc(doc)
file_data = base64.b64decode(parts["data"]) if parts["is_base64"] else parts["data"].encode() except Exception as e:
mime_type = parts["mimetype"] log.error(f"Failed to extract content from document {doc.document_id}: {e}")
else: continue
async with httpx.AsyncClient() as client:
response = await client.get(doc.content.uri)
file_data = response.content
mime_type = doc.mime_type or response.headers.get("content-type", "application/octet-stream")
else:
content_str = await content_from_doc(doc)
file_data = content_str.encode("utf-8")
mime_type = doc.mime_type or "text/plain"
file_extension = mimetypes.guess_extension(mime_type) or ".txt" file_extension = mimetypes.guess_extension(mime_type) or ".txt"
filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}") filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}")
@ -118,9 +148,13 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
upload_file = UploadFile(file=file_obj, filename=filename) upload_file = UploadFile(file=file_obj, filename=filename)
try:
created_file = await self.files_api.openai_upload_file( created_file = await self.files_api.openai_upload_file(
file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS
) )
except Exception as e:
log.error(f"Failed to upload file for document {doc.document_id}: {e}")
continue
chunking_strategy = VectorStoreChunkingStrategyStatic( chunking_strategy = VectorStoreChunkingStrategyStatic(
static=VectorStoreChunkingStrategyStaticConfig( static=VectorStoreChunkingStrategyStaticConfig(
@ -129,12 +163,22 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
) )
) )
try:
await self.vector_io_api.openai_attach_file_to_vector_store( await self.vector_io_api.openai_attach_file_to_vector_store(
vector_store_id=vector_db_id, vector_store_id=vector_db_id,
file_id=created_file.id, file_id=created_file.id,
attributes=doc.metadata, attributes=doc.metadata,
chunking_strategy=chunking_strategy, chunking_strategy=chunking_strategy,
) )
except Exception as e:
log.error(
f"Failed to attach file {created_file.id} to vector store {vector_db_id} for document {doc.document_id}: {e}"
)
continue
except Exception as e:
log.error(f"Unexpected error processing document {doc.document_id}: {e}")
continue
async def query( async def query(
self, self,
@ -274,7 +318,6 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
if query_config: if query_config:
query_config = TypeAdapter(RAGQueryConfig).validate_python(query_config) query_config = TypeAdapter(RAGQueryConfig).validate_python(query_config)
else: else:
# handle someone passing an empty dict
query_config = RAGQueryConfig() query_config = RAGQueryConfig()
query = kwargs["query"] query = kwargs["query"]
@ -285,6 +328,6 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
) )
return ToolInvocationResult( return ToolInvocationResult(
content=result.content, content=result.content or [],
metadata=result.metadata, metadata=result.metadata,
) )

View file

@ -13,7 +13,7 @@ def available_providers() -> list[ProviderSpec]:
InlineProviderSpec( InlineProviderSpec(
api=Api.batches, api=Api.batches,
provider_type="inline::reference", provider_type="inline::reference",
pip_packages=["openai"], pip_packages=[],
module="llama_stack.providers.inline.batches.reference", module="llama_stack.providers.inline.batches.reference",
config_class="llama_stack.providers.inline.batches.reference.config.ReferenceBatchesImplConfig", config_class="llama_stack.providers.inline.batches.reference.config.ReferenceBatchesImplConfig",
api_dependencies=[ api_dependencies=[

View file

@ -75,7 +75,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="vllm", adapter_type="vllm",
pip_packages=["openai"], pip_packages=[],
module="llama_stack.providers.remote.inference.vllm", module="llama_stack.providers.remote.inference.vllm",
config_class="llama_stack.providers.remote.inference.vllm.VLLMInferenceAdapterConfig", config_class="llama_stack.providers.remote.inference.vllm.VLLMInferenceAdapterConfig",
description="Remote vLLM inference provider for connecting to vLLM servers.", description="Remote vLLM inference provider for connecting to vLLM servers.",
@ -151,9 +151,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="databricks", adapter_type="databricks",
pip_packages=[ pip_packages=[],
"openai",
],
module="llama_stack.providers.remote.inference.databricks", module="llama_stack.providers.remote.inference.databricks",
config_class="llama_stack.providers.remote.inference.databricks.DatabricksImplConfig", config_class="llama_stack.providers.remote.inference.databricks.DatabricksImplConfig",
description="Databricks inference provider for running models on Databricks' unified analytics platform.", description="Databricks inference provider for running models on Databricks' unified analytics platform.",
@ -163,9 +161,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="nvidia", adapter_type="nvidia",
pip_packages=[ pip_packages=[],
"openai",
],
module="llama_stack.providers.remote.inference.nvidia", module="llama_stack.providers.remote.inference.nvidia",
config_class="llama_stack.providers.remote.inference.nvidia.NVIDIAConfig", config_class="llama_stack.providers.remote.inference.nvidia.NVIDIAConfig",
description="NVIDIA inference provider for accessing NVIDIA NIM models and AI services.", description="NVIDIA inference provider for accessing NVIDIA NIM models and AI services.",
@ -175,7 +171,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="runpod", adapter_type="runpod",
pip_packages=["openai"], pip_packages=[],
module="llama_stack.providers.remote.inference.runpod", module="llama_stack.providers.remote.inference.runpod",
config_class="llama_stack.providers.remote.inference.runpod.RunpodImplConfig", config_class="llama_stack.providers.remote.inference.runpod.RunpodImplConfig",
description="RunPod inference provider for running models on RunPod's cloud GPU platform.", description="RunPod inference provider for running models on RunPod's cloud GPU platform.",
@ -207,7 +203,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="gemini", adapter_type="gemini",
pip_packages=["litellm", "openai"], pip_packages=["litellm"],
module="llama_stack.providers.remote.inference.gemini", module="llama_stack.providers.remote.inference.gemini",
config_class="llama_stack.providers.remote.inference.gemini.GeminiConfig", config_class="llama_stack.providers.remote.inference.gemini.GeminiConfig",
provider_data_validator="llama_stack.providers.remote.inference.gemini.config.GeminiProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.gemini.config.GeminiProviderDataValidator",
@ -218,7 +214,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="vertexai", adapter_type="vertexai",
pip_packages=["litellm", "google-cloud-aiplatform", "openai"], pip_packages=["litellm", "google-cloud-aiplatform"],
module="llama_stack.providers.remote.inference.vertexai", module="llama_stack.providers.remote.inference.vertexai",
config_class="llama_stack.providers.remote.inference.vertexai.VertexAIConfig", config_class="llama_stack.providers.remote.inference.vertexai.VertexAIConfig",
provider_data_validator="llama_stack.providers.remote.inference.vertexai.config.VertexAIProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.vertexai.config.VertexAIProviderDataValidator",
@ -248,7 +244,7 @@ Available Models:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="groq", adapter_type="groq",
pip_packages=["litellm", "openai"], pip_packages=["litellm"],
module="llama_stack.providers.remote.inference.groq", module="llama_stack.providers.remote.inference.groq",
config_class="llama_stack.providers.remote.inference.groq.GroqConfig", config_class="llama_stack.providers.remote.inference.groq.GroqConfig",
provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator",
@ -270,7 +266,7 @@ Available Models:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="sambanova", adapter_type="sambanova",
pip_packages=["litellm", "openai"], pip_packages=["litellm"],
module="llama_stack.providers.remote.inference.sambanova", module="llama_stack.providers.remote.inference.sambanova",
config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig", config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig",
provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator",
@ -299,4 +295,19 @@ Available Models:
description="IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform.", description="IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform.",
), ),
), ),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_type="azure",
pip_packages=["litellm"],
module="llama_stack.providers.remote.inference.azure",
config_class="llama_stack.providers.remote.inference.azure.AzureConfig",
provider_data_validator="llama_stack.providers.remote.inference.azure.config.AzureProviderDataValidator",
description="""
Azure OpenAI inference provider for accessing GPT models and other Azure services.
Provider documentation
https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
""",
),
),
] ]

View file

@ -38,7 +38,7 @@ def available_providers() -> list[ProviderSpec]:
InlineProviderSpec( InlineProviderSpec(
api=Api.scoring, api=Api.scoring,
provider_type="inline::braintrust", provider_type="inline::braintrust",
pip_packages=["autoevals", "openai"], pip_packages=["autoevals"],
module="llama_stack.providers.inline.scoring.braintrust", module="llama_stack.providers.inline.scoring.braintrust",
config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig", config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig",
api_dependencies=[ api_dependencies=[

View file

@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import AzureConfig
async def get_adapter_impl(config: AzureConfig, _deps):
from .azure import AzureInferenceAdapter
impl = AzureInferenceAdapter(config)
await impl.initialize()
return impl

View file

@ -0,0 +1,64 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
from urllib.parse import urljoin
from llama_stack.apis.inference import ChatCompletionRequest
from llama_stack.providers.utils.inference.litellm_openai_mixin import (
LiteLLMOpenAIMixin,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import AzureConfig
from .models import MODEL_ENTRIES
class AzureInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
def __init__(self, config: AzureConfig) -> None:
LiteLLMOpenAIMixin.__init__(
self,
MODEL_ENTRIES,
litellm_provider_name="azure",
api_key_from_config=config.api_key.get_secret_value(),
provider_data_api_key_field="azure_api_key",
openai_compat_api_base=str(config.api_base),
)
self.config = config
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self) -> str:
"""
Get the Azure API base URL.
Returns the Azure API base URL from the configuration.
"""
return urljoin(str(self.config.api_base), "/openai/v1")
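A quick stdlib illustration of what get_base_url produces (not part of the PR); note that urljoin with the absolute path "/openai/v1" replaces any path already present on api_base:

from urllib.parse import urljoin

assert urljoin("https://my-resource.openai.azure.com", "/openai/v1") == "https://my-resource.openai.azure.com/openai/v1"
assert urljoin("https://my-resource.openai.azure.com/some/path", "/openai/v1") == "https://my-resource.openai.azure.com/openai/v1"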
async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
# Get base parameters from parent
params = await super()._get_params(request)
# Add Azure specific parameters
provider_data = self.get_request_provider_data()
if provider_data:
if getattr(provider_data, "azure_api_key", None):
params["api_key"] = provider_data.azure_api_key
if getattr(provider_data, "azure_api_base", None):
params["api_base"] = provider_data.azure_api_base
if getattr(provider_data, "azure_api_version", None):
params["api_version"] = provider_data.azure_api_version
if getattr(provider_data, "azure_api_type", None):
params["api_type"] = provider_data.azure_api_type
else:
params["api_key"] = self.config.api_key.get_secret_value()
params["api_base"] = str(self.config.api_base)
params["api_version"] = self.config.api_version
params["api_type"] = self.config.api_type
return params

View file

@ -0,0 +1,63 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from typing import Any
from pydantic import BaseModel, Field, HttpUrl, SecretStr
from llama_stack.schema_utils import json_schema_type
class AzureProviderDataValidator(BaseModel):
azure_api_key: SecretStr = Field(
description="Azure API key for Azure",
)
azure_api_base: HttpUrl = Field(
description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com)",
)
azure_api_version: str | None = Field(
default=None,
description="Azure API version for Azure (e.g., 2024-06-01)",
)
azure_api_type: str | None = Field(
default="azure",
description="Azure API type for Azure (e.g., azure)",
)
@json_schema_type
class AzureConfig(BaseModel):
api_key: SecretStr = Field(
description="Azure API key for Azure",
)
api_base: HttpUrl = Field(
description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com)",
)
api_version: str | None = Field(
default_factory=lambda: os.getenv("AZURE_API_VERSION"),
description="Azure API version for Azure (e.g., 2024-12-01-preview)",
)
api_type: str | None = Field(
default_factory=lambda: os.getenv("AZURE_API_TYPE", "azure"),
description="Azure API type for Azure (e.g., azure)",
)
@classmethod
def sample_run_config(
cls,
api_key: str = "${env.AZURE_API_KEY:=}",
api_base: str = "${env.AZURE_API_BASE:=}",
api_version: str = "${env.AZURE_API_VERSION:=}",
api_type: str = "${env.AZURE_API_TYPE:=}",
**kwargs,
) -> dict[str, Any]:
return {
"api_key": api_key,
"api_base": api_base,
"api_version": api_version,
"api_type": api_type,
}

View file

@ -0,0 +1,28 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.providers.utils.inference.model_registry import (
ProviderModelEntry,
)
# https://learn.microsoft.com/en-us/azure/ai-foundry/openai/concepts/models?tabs=global-standard%2Cstandard-chat-completions
LLM_MODEL_IDS = [
"gpt-5",
"gpt-5-mini",
"gpt-5-nano",
"gpt-5-chat",
"o1",
"o1-mini",
"o3-mini",
"o4-mini",
"gpt-4.1",
"gpt-4.1-mini",
"gpt-4.1-nano",
]
SAFETY_MODELS_ENTRIES = list[ProviderModelEntry]()
MODEL_ENTRIES = [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS] + SAFETY_MODELS_ENTRIES

View file

@ -53,6 +53,43 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
from .models import MODEL_ENTRIES from .models import MODEL_ENTRIES
REGION_PREFIX_MAP = {
"us": "us.",
"eu": "eu.",
"ap": "ap.",
}
def _get_region_prefix(region: str | None) -> str:
# AWS requires region prefixes for inference profiles
if region is None:
return "us." # default to US when we don't know
# Handle case insensitive region matching
region_lower = region.lower()
for prefix in REGION_PREFIX_MAP:
if region_lower.startswith(f"{prefix}-"):
return REGION_PREFIX_MAP[prefix]
# Fallback to US for anything we don't recognize
return "us."
def _to_inference_profile_id(model_id: str, region: str = None) -> str:
# Return ARNs unchanged
if model_id.startswith("arn:"):
return model_id
# Return inference profile IDs that already have regional prefixes
if any(model_id.startswith(p) for p in REGION_PREFIX_MAP.values()):
return model_id
# Default to US East when no region is provided
if region is None:
region = "us-east-1"
return _get_region_prefix(region) + model_id
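Worked examples of the helpers above (illustrative model IDs and a fabricated ARN, not from the PR):

assert _to_inference_profile_id("meta.llama3-1-8b-instruct-v1:0", "us-east-1") == "us.meta.llama3-1-8b-instruct-v1:0"
assert _to_inference_profile_id("meta.llama3-1-8b-instruct-v1:0", "eu-west-1") == "eu.meta.llama3-1-8b-instruct-v1:0"
assert _to_inference_profile_id("meta.llama3-1-8b-instruct-v1:0") == "us.meta.llama3-1-8b-instruct-v1:0"  # region defaults to us-east-1
assert _to_inference_profile_id("ap.meta.llama3-1-8b-instruct-v1:0") == "ap.meta.llama3-1-8b-instruct-v1:0"  # already prefixed
assert _to_inference_profile_id("arn:aws:bedrock:us-east-1:000000000000:inference-profile/us.meta.llama3-1-8b-instruct-v1:0").startswith("arn:")  # ARNs pass through unchanged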
class BedrockInferenceAdapter( class BedrockInferenceAdapter(
ModelRegistryHelper, ModelRegistryHelper,
@ -166,8 +203,13 @@ class BedrockInferenceAdapter(
options["repetition_penalty"] = sampling_params.repetition_penalty options["repetition_penalty"] = sampling_params.repetition_penalty
prompt = await chat_completion_request_to_prompt(request, self.get_llama_model(request.model)) prompt = await chat_completion_request_to_prompt(request, self.get_llama_model(request.model))
# Convert foundation model ID to inference profile ID
region_name = self.client.meta.region_name
inference_profile_id = _to_inference_profile_id(bedrock_model, region_name)
return { return {
"modelId": bedrock_model, "modelId": inference_profile_id,
"body": json.dumps( "body": json.dumps(
{ {
"prompt": prompt, "prompt": prompt,
@ -185,6 +227,11 @@ class BedrockInferenceAdapter(
task_type: EmbeddingTaskType | None = None, task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse: ) -> EmbeddingsResponse:
model = await self.model_store.get_model(model_id) model = await self.model_store.get_model(model_id)
# Convert foundation model ID to inference profile ID
region_name = self.client.meta.region_name
inference_profile_id = _to_inference_profile_id(model.provider_resource_id, region_name)
embeddings = [] embeddings = []
for content in contents: for content in contents:
assert not content_has_media(content), "Bedrock does not support media for embeddings" assert not content_has_media(content), "Bedrock does not support media for embeddings"
@ -193,7 +240,7 @@ class BedrockInferenceAdapter(
body = json.dumps(input_body) body = json.dumps(input_body)
response = self.client.invoke_model( response = self.client.invoke_model(
body=body, body=body,
modelId=model.provider_resource_id, modelId=inference_profile_id,
accept="application/json", accept="application/json",
contentType="application/json", contentType="application/json",
) )

View file

@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import json import json
from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import AsyncGenerator
from typing import Any from typing import Any
import httpx import httpx
@ -38,13 +38,6 @@ from llama_stack.apis.inference import (
LogProbConfig, LogProbConfig,
Message, Message,
ModelStore, ModelStore,
OpenAIChatCompletion,
OpenAICompletion,
OpenAIEmbeddingData,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
OpenAIMessageParam,
OpenAIResponseFormatParam,
ResponseFormat, ResponseFormat,
SamplingParams, SamplingParams,
TextTruncation, TextTruncation,
@ -71,11 +64,11 @@ from llama_stack.providers.utils.inference.openai_compat import (
convert_message_to_openai_dict, convert_message_to_openai_dict,
convert_tool_call, convert_tool_call,
get_sampling_options, get_sampling_options,
prepare_openai_completion_params,
process_chat_completion_stream_response, process_chat_completion_stream_response,
process_completion_response, process_completion_response,
process_completion_stream_response, process_completion_stream_response,
) )
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import ( from llama_stack.providers.utils.inference.prompt_adapter import (
completion_request_to_prompt, completion_request_to_prompt,
content_has_media, content_has_media,
@ -288,7 +281,7 @@ async def _process_vllm_chat_completion_stream_response(
yield c yield c
class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): class VLLMInferenceAdapter(OpenAIMixin, Inference, ModelsProtocolPrivate):
# automatically set by the resolver when instantiating the provider # automatically set by the resolver when instantiating the provider
__provider_id__: str __provider_id__: str
model_store: ModelStore | None = None model_store: ModelStore | None = None
@ -296,7 +289,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
def __init__(self, config: VLLMInferenceAdapterConfig) -> None: def __init__(self, config: VLLMInferenceAdapterConfig) -> None:
self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries()) self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
self.config = config self.config = config
self.client = None
async def initialize(self) -> None: async def initialize(self) -> None:
if not self.config.url: if not self.config.url:
@ -308,8 +300,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
return self.config.refresh_models return self.config.refresh_models
async def list_models(self) -> list[Model] | None: async def list_models(self) -> list[Model] | None:
self._lazy_initialize_client()
assert self.client is not None # mypy
models = [] models = []
async for m in self.client.models.list(): async for m in self.client.models.list():
model_type = ModelType.llm # unclear how to determine embedding vs. llm models model_type = ModelType.llm # unclear how to determine embedding vs. llm models
@ -340,8 +330,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
HealthResponse: A dictionary containing the health status. HealthResponse: A dictionary containing the health status.
""" """
try: try:
client = self._create_client() if self.client is None else self.client _ = [m async for m in self.client.models.list()] # Ensure the client is initialized
_ = [m async for m in client.models.list()] # Ensure the client is initialized
return HealthResponse(status=HealthStatus.OK) return HealthResponse(status=HealthStatus.OK)
except Exception as e: except Exception as e:
return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}") return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")
@ -351,19 +340,14 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
raise ValueError("Model store not set") raise ValueError("Model store not set")
return await self.model_store.get_model(model_id) return await self.model_store.get_model(model_id)
def _lazy_initialize_client(self): def get_api_key(self):
if self.client is not None: return self.config.api_token
return
log.info(f"Initializing vLLM client with base_url={self.config.url}") def get_base_url(self):
self.client = self._create_client() return self.config.url
def _create_client(self): def get_extra_client_params(self):
return AsyncOpenAI( return {"http_client": httpx.AsyncClient(verify=self.config.tls_verify)}
base_url=self.config.url,
api_key=self.config.api_token,
http_client=httpx.AsyncClient(verify=self.config.tls_verify),
)
async def completion( async def completion(
self, self,
@ -374,7 +358,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
stream: bool | None = False, stream: bool | None = False,
logprobs: LogProbConfig | None = None, logprobs: LogProbConfig | None = None,
) -> CompletionResponse | AsyncGenerator[CompletionResponseStreamChunk, None]: ) -> CompletionResponse | AsyncGenerator[CompletionResponseStreamChunk, None]:
self._lazy_initialize_client()
if sampling_params is None: if sampling_params is None:
sampling_params = SamplingParams() sampling_params = SamplingParams()
model = await self._get_model(model_id) model = await self._get_model(model_id)
@ -406,7 +389,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
logprobs: LogProbConfig | None = None, logprobs: LogProbConfig | None = None,
tool_config: ToolConfig | None = None, tool_config: ToolConfig | None = None,
) -> ChatCompletionResponse | AsyncGenerator[ChatCompletionResponseStreamChunk, None]: ) -> ChatCompletionResponse | AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
self._lazy_initialize_client()
if sampling_params is None: if sampling_params is None:
sampling_params = SamplingParams() sampling_params = SamplingParams()
model = await self._get_model(model_id) model = await self._get_model(model_id)
@ -479,16 +461,12 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
yield chunk yield chunk
async def register_model(self, model: Model) -> Model: async def register_model(self, model: Model) -> Model:
# register_model is called during Llama Stack initialization, hence we cannot init self.client if not initialized yet.
# self.client should only be created after the initialization is complete to avoid asyncio cross-context errors.
# Changing this may lead to unpredictable behavior.
client = self._create_client() if self.client is None else self.client
try: try:
model = await self.register_helper.register_model(model) model = await self.register_helper.register_model(model)
except ValueError: except ValueError:
pass # Ignore statically unknown model, will check live listing pass # Ignore statically unknown model, will check live listing
try: try:
res = await client.models.list() res = await self.client.models.list()
except APIConnectionError as e: except APIConnectionError as e:
raise ValueError( raise ValueError(
f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL." f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL."
@ -543,8 +521,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
output_dimension: int | None = None, output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None, task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse: ) -> EmbeddingsResponse:
self._lazy_initialize_client()
assert self.client is not None
model = await self._get_model(model_id) model = await self._get_model(model_id)
kwargs = {} kwargs = {}
@ -560,154 +536,3 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
embeddings = [data.embedding for data in response.data] embeddings = [data.embedding for data in response.data]
return EmbeddingsResponse(embeddings=embeddings) return EmbeddingsResponse(embeddings=embeddings)
async def openai_embeddings(
self,
model: str,
input: str | list[str],
encoding_format: str | None = "float",
dimensions: int | None = None,
user: str | None = None,
) -> OpenAIEmbeddingsResponse:
self._lazy_initialize_client()
assert self.client is not None
model_obj = await self._get_model(model)
assert model_obj.model_type == ModelType.embedding
# Convert input to list if it's a string
input_list = [input] if isinstance(input, str) else input
# Call vLLM embeddings endpoint with encoding_format
response = await self.client.embeddings.create(
model=model_obj.provider_resource_id,
input=input_list,
dimensions=dimensions,
encoding_format=encoding_format,
)
# Convert response to OpenAI format
data = [
OpenAIEmbeddingData(
embedding=embedding_data.embedding,
index=i,
)
for i, embedding_data in enumerate(response.data)
]
# Not returning actual token usage since vLLM doesn't provide it
usage = OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1)
return OpenAIEmbeddingsResponse(
data=data,
model=model_obj.provider_resource_id,
usage=usage,
)
async def openai_completion(
self,
model: str,
prompt: str | list[str] | list[int] | list[list[int]],
best_of: int | None = None,
echo: bool | None = None,
frequency_penalty: float | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_tokens: int | None = None,
n: int | None = None,
presence_penalty: float | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
top_p: float | None = None,
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
self._lazy_initialize_client()
model_obj = await self._get_model(model)
extra_body: dict[str, Any] = {}
if prompt_logprobs is not None and prompt_logprobs >= 0:
extra_body["prompt_logprobs"] = prompt_logprobs
if guided_choice:
extra_body["guided_choice"] = guided_choice
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
prompt=prompt,
best_of=best_of,
echo=echo,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
top_p=top_p,
user=user,
extra_body=extra_body,
)
return await self.client.completions.create(**params) # type: ignore
async def openai_chat_completion(
self,
model: str,
messages: list[OpenAIMessageParam],
frequency_penalty: float | None = None,
function_call: str | dict[str, Any] | None = None,
functions: list[dict[str, Any]] | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_completion_tokens: int | None = None,
max_tokens: int | None = None,
n: int | None = None,
parallel_tool_calls: bool | None = None,
presence_penalty: float | None = None,
response_format: OpenAIResponseFormatParam | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
tool_choice: str | dict[str, Any] | None = None,
tools: list[dict[str, Any]] | None = None,
top_logprobs: int | None = None,
top_p: float | None = None,
user: str | None = None,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
self._lazy_initialize_client()
model_obj = await self._get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)
return await self.client.chat.completions.create(**params) # type: ignore

View file

@ -3,6 +3,11 @@
# #
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import asyncio
from typing import Any
from sqlalchemy.exc import IntegrityError
from llama_stack.apis.inference import ( from llama_stack.apis.inference import (
ListOpenAIChatCompletionResponse, ListOpenAIChatCompletionResponse,
OpenAIChatCompletion, OpenAIChatCompletion,
@ -10,24 +15,43 @@ from llama_stack.apis.inference import (
OpenAIMessageParam, OpenAIMessageParam,
Order, Order,
) )
from llama_stack.core.datatypes import AccessRule from llama_stack.core.datatypes import AccessRule, InferenceStoreConfig
from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR from llama_stack.log import get_logger
from ..sqlstore.api import ColumnDefinition, ColumnType from ..sqlstore.api import ColumnDefinition, ColumnType
from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl from ..sqlstore.sqlstore import SqlStoreConfig, SqlStoreType, sqlstore_impl
logger = get_logger(name=__name__, category="inference_store")
class InferenceStore: class InferenceStore:
def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]): def __init__(
if not sql_store_config: self,
sql_store_config = SqliteSqlStoreConfig( config: InferenceStoreConfig | SqlStoreConfig,
db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(), policy: list[AccessRule],
):
# Handle backward compatibility
if not isinstance(config, InferenceStoreConfig):
# Legacy: SqlStoreConfig passed directly as config
config = InferenceStoreConfig(
sql_store_config=config,
) )
self.sql_store_config = sql_store_config
self.config = config
self.sql_store_config = config.sql_store_config
self.sql_store = None self.sql_store = None
self.policy = policy self.policy = policy
# Disable write queue for SQLite to avoid concurrency issues
self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite
# Async write queue and worker control
self._queue: asyncio.Queue[tuple[OpenAIChatCompletion, list[OpenAIMessageParam]]] | None = None
self._worker_tasks: list[asyncio.Task[Any]] = []
self._max_write_queue_size: int = config.max_write_queue_size
self._num_writers: int = max(1, config.num_writers)
async def initialize(self): async def initialize(self):
"""Create the necessary tables if they don't exist.""" """Create the necessary tables if they don't exist."""
self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config)) self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config))
@ -42,23 +66,109 @@ class InferenceStore:
}, },
) )
if self.enable_write_queue:
self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
for _ in range(self._num_writers):
self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
else:
logger.info("Write queue disabled for SQLite to avoid concurrency issues")
async def shutdown(self) -> None:
if not self._worker_tasks:
return
if self._queue is not None:
await self._queue.join()
for t in self._worker_tasks:
if not t.done():
t.cancel()
for t in self._worker_tasks:
try:
await t
except asyncio.CancelledError:
pass
self._worker_tasks.clear()
async def flush(self) -> None:
"""Wait for all queued writes to complete. Useful for testing."""
if self.enable_write_queue and self._queue is not None:
await self._queue.join()
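A minimal usage sketch of the new queue-backed lifecycle (assumes an already-constructed InferenceStore plus a chat completion and its input messages; illustrative only):

async def demo_inference_store(store, chat_completion, input_messages):
    await store.initialize()                                             # creates tables and starts writer tasks
    await store.store_chat_completion(chat_completion, input_messages)   # enqueued; returns without waiting for the write
    await store.flush()                                                  # block until queued writes are persisted (handy in tests)
    await store.shutdown()                                               # drain the queue and stop the workers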
async def store_chat_completion( async def store_chat_completion(
self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam] self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam]
) -> None: ) -> None:
if not self.sql_store: if self.enable_write_queue:
if self._queue is None:
raise ValueError("Inference store is not initialized")
try:
self._queue.put_nowait((chat_completion, input_messages))
except asyncio.QueueFull:
logger.warning(
f"Write queue full; adding chat completion id={getattr(chat_completion, 'id', '<unknown>')}"
)
await self._queue.put((chat_completion, input_messages))
else:
await self._write_chat_completion(chat_completion, input_messages)
async def _worker_loop(self) -> None:
assert self._queue is not None
while True:
try:
item = await self._queue.get()
except asyncio.CancelledError:
break
chat_completion, input_messages = item
try:
await self._write_chat_completion(chat_completion, input_messages)
except Exception as e: # noqa: BLE001
logger.error(f"Error writing chat completion: {e}")
finally:
self._queue.task_done()
async def _write_chat_completion(
self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam]
) -> None:
if self.sql_store is None:
raise ValueError("Inference store is not initialized") raise ValueError("Inference store is not initialized")
data = chat_completion.model_dump() data = chat_completion.model_dump()
record_data = {
await self.sql_store.insert(
table="chat_completions",
data={
"id": data["id"], "id": data["id"],
"created": data["created"], "created": data["created"],
"model": data["model"], "model": data["model"],
"choices": data["choices"], "choices": data["choices"],
"input_messages": [message.model_dump() for message in input_messages], "input_messages": [message.model_dump() for message in input_messages],
}, }
try:
await self.sql_store.insert(
table="chat_completions",
data=record_data,
)
except IntegrityError as e:
# Duplicate chat completion IDs can be generated during tests especially if they are replaying
# recorded responses across different tests. No need to warn or error under those circumstances.
# In the wild, this is not likely to happen at all (no evidence) so we aren't really hiding any problem.
# Check if it's a unique constraint violation
error_message = str(e.orig) if e.orig else str(e)
if self._is_unique_constraint_error(error_message):
# Update the existing record instead
await self.sql_store.update(table="chat_completions", data=record_data, where={"id": data["id"]})
else:
# Re-raise if it's not a unique constraint error
raise
def _is_unique_constraint_error(self, error_message: str) -> bool:
"""Check if the error is specifically a unique constraint violation."""
error_lower = error_message.lower()
return any(
indicator in error_lower
for indicator in [
"unique constraint failed", # SQLite
"duplicate key", # PostgreSQL
"unique violation", # PostgreSQL alternative
"duplicate entry", # MySQL
]
) )
async def list_chat_completions( async def list_chat_completions(

View file

@ -67,6 +67,17 @@ class OpenAIMixin(ABC):
""" """
pass pass
def get_extra_client_params(self) -> dict[str, Any]:
"""
Get any extra parameters to pass to the AsyncOpenAI client.
Child classes can override this method to provide additional parameters
such as timeout settings, proxies, etc.
:return: A dictionary of extra parameters
"""
return {}
@property @property
def client(self) -> AsyncOpenAI: def client(self) -> AsyncOpenAI:
""" """
@ -78,6 +89,7 @@ class OpenAIMixin(ABC):
return AsyncOpenAI( return AsyncOpenAI(
api_key=self.get_api_key(), api_key=self.get_api_key(),
base_url=self.get_base_url(), base_url=self.get_base_url(),
**self.get_extra_client_params(),
) )
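A hypothetical subclass showing how an adapter can use the new hook, mirroring the vLLM change elsewhere in this diff (class name and values are illustrative; real adapters may implement additional OpenAIMixin members):

from typing import Any

import httpx

from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin


class ExampleAdapter(OpenAIMixin):
    def get_api_key(self) -> str:
        return "my-token"

    def get_base_url(self) -> str:
        return "http://localhost:8000/v1"

    def get_extra_client_params(self) -> dict[str, Any]:
        # e.g. relax TLS verification for a self-signed local endpoint
        return {"http_client": httpx.AsyncClient(verify=False)}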
async def _get_provider_model_id(self, model: str) -> str: async def _get_provider_model_id(self, model: str) -> str:
@ -124,10 +136,15 @@ class OpenAIMixin(ABC):
""" """
Direct OpenAI completion API call. Direct OpenAI completion API call.
""" """
if guided_choice is not None: # Handle parameters that are not supported by OpenAI API, but may be by the provider
logger.warning("guided_choice is not supported by the OpenAI API. Ignoring.") # prompt_logprobs is supported by vLLM
if prompt_logprobs is not None: # guided_choice is supported by vLLM
logger.warning("prompt_logprobs is not supported by the OpenAI API. Ignoring.") # TODO: test coverage
extra_body: dict[str, Any] = {}
if prompt_logprobs is not None and prompt_logprobs >= 0:
extra_body["prompt_logprobs"] = prompt_logprobs
if guided_choice:
extra_body["guided_choice"] = guided_choice
# TODO: fix openai_completion to return type compatible with OpenAI's API response # TODO: fix openai_completion to return type compatible with OpenAI's API response
return await self.client.completions.create( # type: ignore[no-any-return] return await self.client.completions.create( # type: ignore[no-any-return]
@ -150,7 +167,8 @@ class OpenAIMixin(ABC):
top_p=top_p, top_p=top_p,
user=user, user=user,
suffix=suffix, suffix=suffix,
) ),
extra_body=extra_body,
) )
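A hedged usage sketch of the relaxed parameter handling (assumes `adapter` is an OpenAIMixin-based provider pointed at a vLLM-compatible server; model name and prompt are placeholders):

async def sample_guided_completion(adapter):
    return await adapter.openai_completion(
        model="my-model",
        prompt="The capital of France is",
        guided_choice=["Paris", "London"],  # forwarded via extra_body instead of being dropped
        prompt_logprobs=0,                  # forwarded via extra_body when >= 0
    )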
async def openai_chat_completion( async def openai_chat_completion(

View file

@ -172,6 +172,20 @@ class AuthorizedSqlStore:
return results.data[0] if results.data else None return results.data[0] if results.data else None
async def update(self, table: str, data: Mapping[str, Any], where: Mapping[str, Any]) -> None:
"""Update rows with automatic access control attribute capture."""
enhanced_data = dict(data)
current_user = get_authenticated_user()
if current_user:
enhanced_data["owner_principal"] = current_user.principal
enhanced_data["access_attributes"] = current_user.attributes
else:
enhanced_data["owner_principal"] = None
enhanced_data["access_attributes"] = None
await self.sql_store.update(table, enhanced_data, where)
async def delete(self, table: str, where: Mapping[str, Any]) -> None: async def delete(self, table: str, where: Mapping[str, Any]) -> None:
"""Delete rows with automatic access control filtering.""" """Delete rows with automatic access control filtering."""
await self.sql_store.delete(table, where) await self.sql_store.delete(table, where)

View file

@ -18,6 +18,7 @@ from functools import wraps
from typing import Any from typing import Any
from llama_stack.apis.telemetry import ( from llama_stack.apis.telemetry import (
Event,
LogSeverity, LogSeverity,
Span, Span,
SpanEndPayload, SpanEndPayload,
@ -98,7 +99,7 @@ class BackgroundLogger:
def __init__(self, api: Telemetry, capacity: int = 100000): def __init__(self, api: Telemetry, capacity: int = 100000):
self.api = api self.api = api
self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity) self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity)
self.worker_thread = threading.Thread(target=self._process_logs, daemon=True) self.worker_thread = threading.Thread(target=self._worker, daemon=True)
self.worker_thread.start() self.worker_thread.start()
self._last_queue_full_log_time: float = 0.0 self._last_queue_full_log_time: float = 0.0
self._dropped_since_last_notice: int = 0 self._dropped_since_last_notice: int = 0
@ -118,12 +119,16 @@ class BackgroundLogger:
self._last_queue_full_log_time = current_time self._last_queue_full_log_time = current_time
self._dropped_since_last_notice = 0 self._dropped_since_last_notice = 0
def _process_logs(self): def _worker(self):
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(self._process_logs())
async def _process_logs(self):
while True: while True:
try: try:
event = self.log_queue.get() event = self.log_queue.get()
# figure out how to use a thread's native loop await self.api.log_event(event)
asyncio.run(self.api.log_event(event))
except Exception: except Exception:
import traceback import traceback
@ -136,6 +141,19 @@ class BackgroundLogger:
self.log_queue.join() self.log_queue.join()
def enqueue_event(event: Event) -> None:
"""Enqueue a telemetry event to the background logger if available.
This provides a non-blocking path for routers and other hot paths to
submit telemetry without awaiting the Telemetry API, reducing contention
with the main event loop.
"""
global BACKGROUND_LOGGER
if BACKGROUND_LOGGER is None:
raise RuntimeError("Telemetry API not initialized")
BACKGROUND_LOGGER.log_event(event)
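The _worker/_process_logs change above swaps per-event asyncio.run() calls for one long-lived event loop per worker thread; a generic, self-contained sketch of that pattern (not the actual class):

import asyncio
import queue
import threading


def start_background_worker(async_handler, q: queue.Queue) -> threading.Thread:
    def _worker() -> None:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        async def _process() -> None:
            while True:
                item = q.get()  # blocking get is acceptable: this loop owns the whole thread
                try:
                    await async_handler(item)
                finally:
                    q.task_done()

        loop.run_until_complete(_process())

    t = threading.Thread(target=_worker, daemon=True)
    t.start()
    return t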
class TraceContext: class TraceContext:
spans: list[Span] = [] spans: list[Span] = []
@ -256,11 +274,7 @@ class TelemetryHandler(logging.Handler):
if record.module in ("asyncio", "selector_events"): if record.module in ("asyncio", "selector_events"):
return return
global CURRENT_TRACE_CONTEXT, BACKGROUND_LOGGER global CURRENT_TRACE_CONTEXT
if BACKGROUND_LOGGER is None:
raise RuntimeError("Telemetry API not initialized")
context = CURRENT_TRACE_CONTEXT.get() context = CURRENT_TRACE_CONTEXT.get()
if context is None: if context is None:
return return
@ -269,7 +283,7 @@ class TelemetryHandler(logging.Handler):
if span is None: if span is None:
return return
BACKGROUND_LOGGER.log_event( enqueue_event(
UnstructuredLogEvent( UnstructuredLogEvent(
trace_id=span.trace_id, trace_id=span.trace_id,
span_id=span.span_id, span_id=span.span_id,

View file

@ -12,14 +12,12 @@ import uuid
def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str: def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str:
""" """
Generate a unique chunk ID using a hash of the document ID and chunk text. Generate a unique chunk ID using a hash of the document ID and chunk text.
Then use the first 32 characters of the hash to create a UUID.
Note: MD5 is used only to calculate an identifier, not for security purposes.
Adding usedforsecurity=False for compatibility with FIPS environments.
""" """
hash_input = f"{document_id}:{chunk_text}".encode() hash_input = f"{document_id}:{chunk_text}".encode()
if chunk_window: if chunk_window:
hash_input += f":{chunk_window}".encode() hash_input += f":{chunk_window}".encode()
return str(uuid.UUID(hashlib.md5(hash_input, usedforsecurity=False).hexdigest())) return str(uuid.UUID(hashlib.sha256(hash_input).hexdigest()[:32]))
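A worked example of the new ID scheme (illustrative input): the SHA-256 hex digest is 64 characters, and its first 32 hex characters are exactly what uuid.UUID accepts, so the chunk ID stays deterministic for a given document ID and chunk text:

import hashlib
import uuid

hash_input = "doc-1:hello world".encode()
digest = hashlib.sha256(hash_input).hexdigest()   # 64 hex characters
chunk_id = str(uuid.UUID(digest[:32]))            # first 32 hex chars rendered as a UUID string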
def proper_case(s: str) -> str: def proper_case(s: str) -> str:

View file

@ -15,6 +15,8 @@ from enum import StrEnum
from pathlib import Path from pathlib import Path
from typing import Any, Literal, cast from typing import Any, Literal, cast
from openai import NOT_GIVEN
from llama_stack.log import get_logger from llama_stack.log import get_logger
logger = get_logger(__name__, category="testing") logger = get_logger(__name__, category="testing")
@ -105,7 +107,11 @@ def _deserialize_response(data: dict[str, Any]) -> Any:
return cls.model_validate(data["__data__"]) return cls.model_validate(data["__data__"])
except (ImportError, AttributeError, TypeError, ValueError) as e: except (ImportError, AttributeError, TypeError, ValueError) as e:
logger.warning(f"Failed to deserialize object of type {data['__type__']}: {e}") logger.warning(f"Failed to deserialize object of type {data['__type__']} with model_validate: {e}")
try:
return cls.model_construct(**data["__data__"])
except Exception as e:
logger.warning(f"Failed to deserialize object of type {data['__type__']} with model_construct: {e}")
return data["__data__"] return data["__data__"]
return data return data
@ -194,20 +200,15 @@ def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:
Supported endpoints: Supported endpoints:
- '/api/tags' (Ollama): response body has 'models': [ { name/model/digest/id/... }, ... ] - '/api/tags' (Ollama): response body has 'models': [ { name/model/digest/id/... }, ... ]
- '/v1/models' (OpenAI): response body has 'data': [ { id: ... }, ... ] - '/v1/models' (OpenAI): response body is: [ { id: ... }, ... ]
Returns a list of unique identifiers or None if structure doesn't match. Returns a list of unique identifiers or None if structure doesn't match.
""" """
body = response["body"] items = response["body"]
if endpoint == "/api/tags": idents = [m.model if endpoint == "/api/tags" else m.id for m in items]
items = body.get("models")
idents = [m.model for m in items]
else:
items = body.get("data")
idents = [m.id for m in items]
return sorted(set(idents)) return sorted(set(idents))
identifiers = _extract_model_identifiers() identifiers = _extract_model_identifiers()
return hashlib.sha1(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8] return hashlib.sha256(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8]
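A short sketch of how the new digest is computed from the extracted model identifiers (the identifier list below is illustrative only):

```python
import hashlib

# Sort and de-duplicate the identifiers, join them, and take the first 8 hex
# characters of a SHA-256 digest as a short, order-insensitive fingerprint.
identifiers = sorted({"llama3.2:3b", "all-minilm:l6-v2", "llama3.2:3b"})
digest = hashlib.sha256("|".join(identifiers).encode("utf-8")).hexdigest()[:8]
print(digest)
```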
def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None: def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None:
@ -215,17 +216,12 @@ def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]])
seen: dict[str, dict[str, Any]] = {} seen: dict[str, dict[str, Any]] = {}
for rec in records: for rec in records:
body = rec["response"]["body"] body = rec["response"]["body"]
if endpoint == "/api/tags":
items = body.models
elif endpoint == "/v1/models":
items = body.data
else:
items = []
for m in items:
if endpoint == "/v1/models": if endpoint == "/v1/models":
for m in body:
key = m.id key = m.id
else: seen[key] = m
elif endpoint == "/api/tags":
for m in body.models:
key = m.model key = m.model
seen[key] = m seen[key] = m
@ -234,9 +230,8 @@ def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]])
canonical_req = canonical.get("request", {}) canonical_req = canonical.get("request", {})
if isinstance(canonical_req, dict): if isinstance(canonical_req, dict):
canonical_req["endpoint"] = endpoint canonical_req["endpoint"] = endpoint
if endpoint == "/v1/models": body = ordered
body = {"data": ordered, "object": "list"} if endpoint == "/api/tags":
else:
from ollama import ListResponse from ollama import ListResponse
body = ListResponse(models=ordered) body = ListResponse(models=ordered)
@ -247,12 +242,17 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
global _current_mode, _current_storage global _current_mode, _current_storage
if _current_mode == InferenceMode.LIVE or _current_storage is None: if _current_mode == InferenceMode.LIVE or _current_storage is None:
# Normal operation if endpoint == "/v1/models":
return original_method(self, *args, **kwargs)
else:
return await original_method(self, *args, **kwargs) return await original_method(self, *args, **kwargs)
# Get base URL based on client type # Get base URL based on client type
if client_type == "openai": if client_type == "openai":
base_url = str(self._client.base_url) base_url = str(self._client.base_url)
# the OpenAI client methods may pass NOT_GIVEN for unset parameters; filter these out
kwargs = {k: v for k, v in kwargs.items() if v is not NOT_GIVEN}
elif client_type == "ollama": elif client_type == "ollama":
# Get base URL from the client (Ollama client uses host attribute) # Get base URL from the client (Ollama client uses host attribute)
base_url = getattr(self, "host", "http://localhost:11434") base_url = getattr(self, "host", "http://localhost:11434")
@ -296,8 +296,15 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
) )
elif _current_mode == InferenceMode.RECORD: elif _current_mode == InferenceMode.RECORD:
if endpoint == "/v1/models":
response = original_method(self, *args, **kwargs)
else:
response = await original_method(self, *args, **kwargs) response = await original_method(self, *args, **kwargs)
# we want to store the result of the iterator, not the iterator itself
if endpoint == "/v1/models":
response = [m async for m in response]
request_data = { request_data = {
"method": method, "method": method,
"url": url, "url": url,
@ -376,10 +383,14 @@ def patch_inference_clients():
_original_methods["embeddings_create"], self, "openai", "/v1/embeddings", *args, **kwargs _original_methods["embeddings_create"], self, "openai", "/v1/embeddings", *args, **kwargs
) )
async def patched_models_list(self, *args, **kwargs): def patched_models_list(self, *args, **kwargs):
return await _patched_inference_method( async def _iter():
for item in await _patched_inference_method(
_original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs _original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs
) ):
yield item
return _iter()
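A simplified sketch of the wrapper pattern used for `models.list` above: the patched method must hand back an async iterator synchronously, so it wraps the awaited, recorded list in a local async generator (the `fetch_models` coroutine is a stand-in for the patched inference method, not the real helper):

```python
import asyncio


async def fetch_models() -> list[str]:
    # Stand-in for _patched_inference_method returning a recorded model list.
    return ["gpt-5-mini", "llama3.2:3b"]


def patched_models_list():
    # Return an async iterator without awaiting here, matching the synchronous
    # signature of the original models.list().
    async def _iter():
        for item in await fetch_models():
            yield item

    return _iter()


async def main():
    async for model in patched_models_list():
        print(model)


asyncio.run(main())
```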
# Apply OpenAI patches # Apply OpenAI patches
AsyncChatCompletions.create = patched_chat_completions_create AsyncChatCompletions.create = patched_chat_completions_create

View file

@ -11,7 +11,7 @@
"@radix-ui/react-collapsible": "^1.1.12", "@radix-ui/react-collapsible": "^1.1.12",
"@radix-ui/react-dialog": "^1.1.13", "@radix-ui/react-dialog": "^1.1.13",
"@radix-ui/react-dropdown-menu": "^2.1.16", "@radix-ui/react-dropdown-menu": "^2.1.16",
"@radix-ui/react-select": "^2.2.5", "@radix-ui/react-select": "^2.2.6",
"@radix-ui/react-separator": "^1.1.7", "@radix-ui/react-separator": "^1.1.7",
"@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-slot": "^1.2.3",
"@radix-ui/react-tooltip": "^1.2.8", "@radix-ui/react-tooltip": "^1.2.8",
@ -20,7 +20,7 @@
"framer-motion": "^12.23.12", "framer-motion": "^12.23.12",
"llama-stack-client": "^0.2.21", "llama-stack-client": "^0.2.21",
"lucide-react": "^0.542.0", "lucide-react": "^0.542.0",
"next": "15.3.3", "next": "15.5.3",
"next-auth": "^4.24.11", "next-auth": "^4.24.11",
"next-themes": "^0.4.6", "next-themes": "^0.4.6",
"react": "^19.0.0", "react": "^19.0.0",
@ -664,9 +664,9 @@
} }
}, },
"node_modules/@emnapi/runtime": { "node_modules/@emnapi/runtime": {
"version": "1.4.3", "version": "1.5.0",
"resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.4.3.tgz", "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.5.0.tgz",
"integrity": "sha512-pBPWdu6MLKROBX05wSNKcNb++m5Er+KQ9QkB+WVM+pW2Kx9hoSrVTnu3BdkI5eBLZoKu/J6mW/B6i6bJB2ytXQ==", "integrity": "sha512-97/BJ3iXHww3djw6hYIfErCZFee7qCtrneuLa20UXFCOTCfBM2cvQHjWJ2EG0s0MtdNwInarqCTz35i4wWXHsQ==",
"license": "MIT", "license": "MIT",
"optional": true, "optional": true,
"dependencies": { "dependencies": {
@ -927,9 +927,9 @@
} }
}, },
"node_modules/@img/sharp-darwin-arm64": { "node_modules/@img/sharp-darwin-arm64": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.3.tgz",
"integrity": "sha512-pn44xgBtgpEbZsu+lWf2KNb6OAf70X68k+yk69Ic2Xz11zHR/w24/U49XT7AeRwJ0Px+mhALhU5LPci1Aymk7A==", "integrity": "sha512-ryFMfvxxpQRsgZJqBd4wsttYQbCxsJksrv9Lw/v798JcQ8+w84mBWuXwl+TT0WJ/WrYOLaYpwQXi3sA9nTIaIg==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -945,13 +945,13 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-darwin-arm64": "1.1.0" "@img/sharp-libvips-darwin-arm64": "1.2.0"
} }
}, },
"node_modules/@img/sharp-darwin-x64": { "node_modules/@img/sharp-darwin-x64": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.3.tgz",
"integrity": "sha512-VfuYgG2r8BpYiOUN+BfYeFo69nP/MIwAtSJ7/Zpxc5QF3KS22z8Pvg3FkrSFJBPNQ7mmcUcYQFBmEQp7eu1F8Q==", "integrity": "sha512-yHpJYynROAj12TA6qil58hmPmAwxKKC7reUqtGLzsOHfP7/rniNGTL8tjWX6L3CTV4+5P4ypcS7Pp+7OB+8ihA==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -967,13 +967,13 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-darwin-x64": "1.1.0" "@img/sharp-libvips-darwin-x64": "1.2.0"
} }
}, },
"node_modules/@img/sharp-libvips-darwin-arm64": { "node_modules/@img/sharp-libvips-darwin-arm64": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.2.0.tgz",
"integrity": "sha512-HZ/JUmPwrJSoM4DIQPv/BfNh9yrOA8tlBbqbLz4JZ5uew2+o22Ik+tHQJcih7QJuSa0zo5coHTfD5J8inqj9DA==", "integrity": "sha512-sBZmpwmxqwlqG9ueWFXtockhsxefaV6O84BMOrhtg/YqbTaRdqDE7hxraVE3y6gVM4eExmfzW4a8el9ArLeEiQ==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -987,9 +987,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-darwin-x64": { "node_modules/@img/sharp-libvips-darwin-x64": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.2.0.tgz",
"integrity": "sha512-Xzc2ToEmHN+hfvsl9wja0RlnXEgpKNmftriQp6XzY/RaSfwD9th+MSh0WQKzUreLKKINb3afirxW7A0fz2YWuQ==", "integrity": "sha512-M64XVuL94OgiNHa5/m2YvEQI5q2cl9d/wk0qFTDVXcYzi43lxuiFTftMR1tOnFQovVXNZJ5TURSDK2pNe9Yzqg==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -1003,9 +1003,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-linux-arm": { "node_modules/@img/sharp-libvips-linux-arm": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.2.0.tgz",
"integrity": "sha512-s8BAd0lwUIvYCJyRdFqvsj+BJIpDBSxs6ivrOPm/R7piTs5UIwY5OjXrP2bqXC9/moGsyRa37eYWYCOGVXxVrA==", "integrity": "sha512-mWd2uWvDtL/nvIzThLq3fr2nnGfyr/XMXlq8ZJ9WMR6PXijHlC3ksp0IpuhK6bougvQrchUAfzRLnbsen0Cqvw==",
"cpu": [ "cpu": [
"arm" "arm"
], ],
@ -1019,9 +1019,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-linux-arm64": { "node_modules/@img/sharp-libvips-linux-arm64": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.2.0.tgz",
"integrity": "sha512-IVfGJa7gjChDET1dK9SekxFFdflarnUB8PwW8aGwEoF3oAsSDuNUTYS+SKDOyOJxQyDC1aPFMuRYLoDInyV9Ew==", "integrity": "sha512-RXwd0CgG+uPRX5YYrkzKyalt2OJYRiJQ8ED/fi1tq9WQW2jsQIn0tqrlR5l5dr/rjqq6AHAxURhj2DVjyQWSOA==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -1035,9 +1035,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-linux-ppc64": { "node_modules/@img/sharp-libvips-linux-ppc64": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-ppc64/-/sharp-libvips-linux-ppc64-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-ppc64/-/sharp-libvips-linux-ppc64-1.2.0.tgz",
"integrity": "sha512-tiXxFZFbhnkWE2LA8oQj7KYR+bWBkiV2nilRldT7bqoEZ4HiDOcePr9wVDAZPi/Id5fT1oY9iGnDq20cwUz8lQ==", "integrity": "sha512-Xod/7KaDDHkYu2phxxfeEPXfVXFKx70EAFZ0qyUdOjCcxbjqyJOEUpDe6RIyaunGxT34Anf9ue/wuWOqBW2WcQ==",
"cpu": [ "cpu": [
"ppc64" "ppc64"
], ],
@ -1051,9 +1051,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-linux-s390x": { "node_modules/@img/sharp-libvips-linux-s390x": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.2.0.tgz",
"integrity": "sha512-xukSwvhguw7COyzvmjydRb3x/09+21HykyapcZchiCUkTThEQEOMtBj9UhkaBRLuBrgLFzQ2wbxdeCCJW/jgJA==", "integrity": "sha512-eMKfzDxLGT8mnmPJTNMcjfO33fLiTDsrMlUVcp6b96ETbnJmd4uvZxVJSKPQfS+odwfVaGifhsB07J1LynFehw==",
"cpu": [ "cpu": [
"s390x" "s390x"
], ],
@ -1067,9 +1067,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-linux-x64": { "node_modules/@img/sharp-libvips-linux-x64": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.2.0.tgz",
"integrity": "sha512-yRj2+reB8iMg9W5sULM3S74jVS7zqSzHG3Ol/twnAAkAhnGQnpjj6e4ayUz7V+FpKypwgs82xbRdYtchTTUB+Q==", "integrity": "sha512-ZW3FPWIc7K1sH9E3nxIGB3y3dZkpJlMnkk7z5tu1nSkBoCgw2nSRTFHI5pB/3CQaJM0pdzMF3paf9ckKMSE9Tg==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -1083,9 +1083,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-linuxmusl-arm64": { "node_modules/@img/sharp-libvips-linuxmusl-arm64": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.2.0.tgz",
"integrity": "sha512-jYZdG+whg0MDK+q2COKbYidaqW/WTz0cc1E+tMAusiDygrM4ypmSCjOJPmFTvHHJ8j/6cAGyeDWZOsK06tP33w==", "integrity": "sha512-UG+LqQJbf5VJ8NWJ5Z3tdIe/HXjuIdo4JeVNADXBFuG7z9zjoegpzzGIyV5zQKi4zaJjnAd2+g2nna8TZvuW9Q==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -1099,9 +1099,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-linuxmusl-x64": { "node_modules/@img/sharp-libvips-linuxmusl-x64": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.2.0.tgz",
"integrity": "sha512-wK7SBdwrAiycjXdkPnGCPLjYb9lD4l6Ze2gSdAGVZrEL05AOUJESWU2lhlC+Ffn5/G+VKuSm6zzbQSzFX/P65A==", "integrity": "sha512-SRYOLR7CXPgNze8akZwjoGBoN1ThNZoqpOgfnOxmWsklTGVfJiGJoC/Lod7aNMGA1jSsKWM1+HRX43OP6p9+6Q==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -1115,9 +1115,9 @@
} }
}, },
"node_modules/@img/sharp-linux-arm": { "node_modules/@img/sharp-linux-arm": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.3.tgz",
"integrity": "sha512-anKiszvACti2sGy9CirTlNyk7BjjZPiML1jt2ZkTdcvpLU1YH6CXwRAZCA2UmRXnhiIftXQ7+Oh62Ji25W72jA==", "integrity": "sha512-oBK9l+h6KBN0i3dC8rYntLiVfW8D8wH+NPNT3O/WBHeW0OQWCjfWksLUaPidsrDKpJgXp3G3/hkmhptAW0I3+A==",
"cpu": [ "cpu": [
"arm" "arm"
], ],
@ -1133,13 +1133,13 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-linux-arm": "1.1.0" "@img/sharp-libvips-linux-arm": "1.2.0"
} }
}, },
"node_modules/@img/sharp-linux-arm64": { "node_modules/@img/sharp-linux-arm64": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.3.tgz",
"integrity": "sha512-kX2c+vbvaXC6vly1RDf/IWNXxrlxLNpBVWkdpRq5Ka7OOKj6nr66etKy2IENf6FtOgklkg9ZdGpEu9kwdlcwOQ==", "integrity": "sha512-QdrKe3EvQrqwkDrtuTIjI0bu6YEJHTgEeqdzI3uWJOH6G1O8Nl1iEeVYRGdj1h5I21CqxSvQp1Yv7xeU3ZewbA==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -1155,13 +1155,35 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-linux-arm64": "1.1.0" "@img/sharp-libvips-linux-arm64": "1.2.0"
}
},
"node_modules/@img/sharp-linux-ppc64": {
"version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-ppc64/-/sharp-linux-ppc64-0.34.3.tgz",
"integrity": "sha512-GLtbLQMCNC5nxuImPR2+RgrviwKwVql28FWZIW1zWruy6zLgA5/x2ZXk3mxj58X/tszVF69KK0Is83V8YgWhLA==",
"cpu": [
"ppc64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linux-ppc64": "1.2.0"
} }
}, },
"node_modules/@img/sharp-linux-s390x": { "node_modules/@img/sharp-linux-s390x": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.34.3.tgz",
"integrity": "sha512-7s0KX2tI9mZI2buRipKIw2X1ufdTeaRgwmRabt5bi9chYfhur+/C1OXg3TKg/eag1W+6CCWLVmSauV1owmRPxA==", "integrity": "sha512-3gahT+A6c4cdc2edhsLHmIOXMb17ltffJlxR0aC2VPZfwKoTGZec6u5GrFgdR7ciJSsHT27BD3TIuGcuRT0KmQ==",
"cpu": [ "cpu": [
"s390x" "s390x"
], ],
@ -1177,13 +1199,13 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-linux-s390x": "1.1.0" "@img/sharp-libvips-linux-s390x": "1.2.0"
} }
}, },
"node_modules/@img/sharp-linux-x64": { "node_modules/@img/sharp-linux-x64": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.3.tgz",
"integrity": "sha512-wExv7SH9nmoBW3Wr2gvQopX1k8q2g5V5Iag8Zk6AVENsjwd+3adjwxtp3Dcu2QhOXr8W9NusBU6XcQUohBZ5MA==", "integrity": "sha512-8kYso8d806ypnSq3/Ly0QEw90V5ZoHh10yH0HnrzOCr6DKAPI6QVHvwleqMkVQ0m+fc7EH8ah0BB0QPuWY6zJQ==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -1199,13 +1221,13 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-linux-x64": "1.1.0" "@img/sharp-libvips-linux-x64": "1.2.0"
} }
}, },
"node_modules/@img/sharp-linuxmusl-arm64": { "node_modules/@img/sharp-linuxmusl-arm64": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.3.tgz",
"integrity": "sha512-DfvyxzHxw4WGdPiTF0SOHnm11Xv4aQexvqhRDAoD00MzHekAj9a/jADXeXYCDFH/DzYruwHbXU7uz+H+nWmSOQ==", "integrity": "sha512-vAjbHDlr4izEiXM1OTggpCcPg9tn4YriK5vAjowJsHwdBIdx0fYRsURkxLG2RLm9gyBq66gwtWI8Gx0/ov+JKQ==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -1221,13 +1243,13 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-linuxmusl-arm64": "1.1.0" "@img/sharp-libvips-linuxmusl-arm64": "1.2.0"
} }
}, },
"node_modules/@img/sharp-linuxmusl-x64": { "node_modules/@img/sharp-linuxmusl-x64": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.3.tgz",
"integrity": "sha512-pax/kTR407vNb9qaSIiWVnQplPcGU8LRIJpDT5o8PdAx5aAA7AS3X9PS8Isw1/WfqgQorPotjrZL3Pqh6C5EBg==", "integrity": "sha512-gCWUn9547K5bwvOn9l5XGAEjVTTRji4aPTqLzGXHvIr6bIDZKNTA34seMPgM0WmSf+RYBH411VavCejp3PkOeQ==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -1243,20 +1265,20 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-linuxmusl-x64": "1.1.0" "@img/sharp-libvips-linuxmusl-x64": "1.2.0"
} }
}, },
"node_modules/@img/sharp-wasm32": { "node_modules/@img/sharp-wasm32": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.34.3.tgz",
"integrity": "sha512-YDybQnYrLQfEpzGOQe7OKcyLUCML4YOXl428gOOzBgN6Gw0rv8dpsJ7PqTHxBnXnwXr8S1mYFSLSa727tpz0xg==", "integrity": "sha512-+CyRcpagHMGteySaWos8IbnXcHgfDn7pO2fiC2slJxvNq9gDipYBN42/RagzctVRKgxATmfqOSulgZv5e1RdMg==",
"cpu": [ "cpu": [
"wasm32" "wasm32"
], ],
"license": "Apache-2.0 AND LGPL-3.0-or-later AND MIT", "license": "Apache-2.0 AND LGPL-3.0-or-later AND MIT",
"optional": true, "optional": true,
"dependencies": { "dependencies": {
"@emnapi/runtime": "^1.4.0" "@emnapi/runtime": "^1.4.4"
}, },
"engines": { "engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0" "node": "^18.17.0 || ^20.3.0 || >=21.0.0"
@ -1265,10 +1287,29 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
} }
}, },
"node_modules/@img/sharp-win32-arm64": {
"version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-arm64/-/sharp-win32-arm64-0.34.3.tgz",
"integrity": "sha512-MjnHPnbqMXNC2UgeLJtX4XqoVHHlZNd+nPt1kRPmj63wURegwBhZlApELdtxM2OIZDRv/DFtLcNhVbd1z8GYXQ==",
"cpu": [
"arm64"
],
"license": "Apache-2.0 AND LGPL-3.0-or-later",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-win32-ia32": { "node_modules/@img/sharp-win32-ia32": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.34.3.tgz",
"integrity": "sha512-WKf/NAZITnonBf3U1LfdjoMgNO5JYRSlhovhRhMxXVdvWYveM4kM3L8m35onYIdh75cOMCo1BexgVQcCDzyoWw==", "integrity": "sha512-xuCdhH44WxuXgOM714hn4amodJMZl3OEvf0GVTm0BEyMeA2to+8HEdRPShH0SLYptJY1uBw+SCFP9WVQi1Q/cw==",
"cpu": [ "cpu": [
"ia32" "ia32"
], ],
@ -1285,9 +1326,9 @@
} }
}, },
"node_modules/@img/sharp-win32-x64": { "node_modules/@img/sharp-win32-x64": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.3.tgz",
"integrity": "sha512-hw1iIAHpNE8q3uMIRCgGOeDoz9KtFNarFLQclLxr/LK1VBkj8nby18RjFvr6aP7USRYAjTZW6yisnBWMX571Tw==", "integrity": "sha512-OWwz05d++TxzLEv4VnsTz5CmZ6mI6S05sfQGEMrNrQcOEERbX46332IvE7pO/EUiw7jUrrS40z/M7kPyjfl04g==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -1849,9 +1890,10 @@
} }
}, },
"node_modules/@next/env": { "node_modules/@next/env": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/env/-/env-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/env/-/env-15.5.3.tgz",
"integrity": "sha512-OdiMrzCl2Xi0VTjiQQUK0Xh7bJHnOuET2s+3V+Y40WJBAXrJeGA3f+I8MZJ/YQ3mVGi5XGR1L66oFlgqXhQ4Vw==" "integrity": "sha512-RSEDTRqyihYXygx/OJXwvVupfr9m04+0vH8vyy0HfZ7keRto6VX9BbEk0J2PUk0VGy6YhklJUSrgForov5F9pw==",
"license": "MIT"
}, },
"node_modules/@next/eslint-plugin-next": { "node_modules/@next/eslint-plugin-next": {
"version": "15.5.2", "version": "15.5.2",
@ -1864,12 +1906,13 @@
} }
}, },
"node_modules/@next/swc-darwin-arm64": { "node_modules/@next/swc-darwin-arm64": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.5.3.tgz",
"integrity": "sha512-WRJERLuH+O3oYB4yZNVahSVFmtxRNjNF1I1c34tYMoJb0Pve+7/RaLAJJizyYiFhjYNGHRAE1Ri2Fd23zgDqhg==", "integrity": "sha512-nzbHQo69+au9wJkGKTU9lP7PXv0d1J5ljFpvb+LnEomLtSbJkbZyEs6sbF3plQmiOB2l9OBtN2tNSvCH1nQ9Jg==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"darwin" "darwin"
@ -1879,12 +1922,13 @@
} }
}, },
"node_modules/@next/swc-darwin-x64": { "node_modules/@next/swc-darwin-x64": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.5.3.tgz",
"integrity": "sha512-XHdzH/yBc55lu78k/XwtuFR/ZXUTcflpRXcsu0nKmF45U96jt1tsOZhVrn5YH+paw66zOANpOnFQ9i6/j+UYvw==", "integrity": "sha512-w83w4SkOOhekJOcA5HBvHyGzgV1W/XvOfpkrxIse4uPWhYTTRwtGEM4v/jiXwNSJvfRvah0H8/uTLBKRXlef8g==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"darwin" "darwin"
@ -1894,12 +1938,13 @@
} }
}, },
"node_modules/@next/swc-linux-arm64-gnu": { "node_modules/@next/swc-linux-arm64-gnu": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.5.3.tgz",
"integrity": "sha512-VZ3sYL2LXB8znNGcjhocikEkag/8xiLgnvQts41tq6i+wql63SMS1Q6N8RVXHw5pEUjiof+II3HkDd7GFcgkzw==", "integrity": "sha512-+m7pfIs0/yvgVu26ieaKrifV8C8yiLe7jVp9SpcIzg7XmyyNE7toC1fy5IOQozmr6kWl/JONC51osih2RyoXRw==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"linux" "linux"
@ -1909,12 +1954,13 @@
} }
}, },
"node_modules/@next/swc-linux-arm64-musl": { "node_modules/@next/swc-linux-arm64-musl": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.5.3.tgz",
"integrity": "sha512-h6Y1fLU4RWAp1HPNJWDYBQ+e3G7sLckyBXhmH9ajn8l/RSMnhbuPBV/fXmy3muMcVwoJdHL+UtzRzs0nXOf9SA==", "integrity": "sha512-u3PEIzuguSenoZviZJahNLgCexGFhso5mxWCrrIMdvpZn6lkME5vc/ADZG8UUk5K1uWRy4hqSFECrON6UKQBbQ==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"linux" "linux"
@ -1924,12 +1970,13 @@
} }
}, },
"node_modules/@next/swc-linux-x64-gnu": { "node_modules/@next/swc-linux-x64-gnu": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.5.3.tgz",
"integrity": "sha512-jJ8HRiF3N8Zw6hGlytCj5BiHyG/K+fnTKVDEKvUCyiQ/0r5tgwO7OgaRiOjjRoIx2vwLR+Rz8hQoPrnmFbJdfw==", "integrity": "sha512-lDtOOScYDZxI2BENN9m0pfVPJDSuUkAD1YXSvlJF0DKwZt0WlA7T7o3wrcEr4Q+iHYGzEaVuZcsIbCps4K27sA==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"linux" "linux"
@ -1939,12 +1986,13 @@
} }
}, },
"node_modules/@next/swc-linux-x64-musl": { "node_modules/@next/swc-linux-x64-musl": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.5.3.tgz",
"integrity": "sha512-HrUcTr4N+RgiiGn3jjeT6Oo208UT/7BuTr7K0mdKRBtTbT4v9zJqCDKO97DUqqoBK1qyzP1RwvrWTvU6EPh/Cw==", "integrity": "sha512-9vWVUnsx9PrY2NwdVRJ4dUURAQ8Su0sLRPqcCCxtX5zIQUBES12eRVHq6b70bbfaVaxIDGJN2afHui0eDm+cLg==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"linux" "linux"
@ -1954,12 +2002,13 @@
} }
}, },
"node_modules/@next/swc-win32-arm64-msvc": { "node_modules/@next/swc-win32-arm64-msvc": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.5.3.tgz",
"integrity": "sha512-SxorONgi6K7ZUysMtRF3mIeHC5aA3IQLmKFQzU0OuhuUYwpOBc1ypaLJLP5Bf3M9k53KUUUj4vTPwzGvl/NwlQ==", "integrity": "sha512-1CU20FZzY9LFQigRi6jM45oJMU3KziA5/sSG+dXeVaTm661snQP6xu3ykGxxwU5sLG3sh14teO/IOEPVsQMRfA==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"win32" "win32"
@ -1969,12 +2018,13 @@
} }
}, },
"node_modules/@next/swc-win32-x64-msvc": { "node_modules/@next/swc-win32-x64-msvc": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.5.3.tgz",
"integrity": "sha512-4QZG6F8enl9/S2+yIiOiju0iCTFd93d8VC1q9LZS4p/Xuk81W2QDjCFeoogmrWWkAD59z8ZxepBQap2dKS5ruw==", "integrity": "sha512-JMoLAq3n3y5tKXPQwCK5c+6tmwkuFDa2XAxz8Wm4+IVthdBZdZGh+lmiLUHg9f9IDwIQpUjp+ysd6OkYTyZRZw==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"win32" "win32"
@ -2874,22 +2924,22 @@
} }
}, },
"node_modules/@radix-ui/react-select": { "node_modules/@radix-ui/react-select": {
"version": "2.2.5", "version": "2.2.6",
"resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.5.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.6.tgz",
"integrity": "sha512-HnMTdXEVuuyzx63ME0ut4+sEMYW6oouHWNGUZc7ddvUWIcfCva/AMoqEW/3wnEllriMWBa0RHspCYnfCWJQYmA==", "integrity": "sha512-I30RydO+bnn2PQztvo25tswPH+wFBjehVGtmagkU78yMdwTwVf12wnAOF+AeP8S2N8xD+5UPbGhkUfPyvT+mwQ==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/number": "1.1.1", "@radix-ui/number": "1.1.1",
"@radix-ui/primitive": "1.1.2", "@radix-ui/primitive": "1.1.3",
"@radix-ui/react-collection": "1.1.7", "@radix-ui/react-collection": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2", "@radix-ui/react-context": "1.1.2",
"@radix-ui/react-direction": "1.1.1", "@radix-ui/react-direction": "1.1.1",
"@radix-ui/react-dismissable-layer": "1.1.10", "@radix-ui/react-dismissable-layer": "1.1.11",
"@radix-ui/react-focus-guards": "1.1.2", "@radix-ui/react-focus-guards": "1.1.3",
"@radix-ui/react-focus-scope": "1.1.7", "@radix-ui/react-focus-scope": "1.1.7",
"@radix-ui/react-id": "1.1.1", "@radix-ui/react-id": "1.1.1",
"@radix-ui/react-popper": "1.2.7", "@radix-ui/react-popper": "1.2.8",
"@radix-ui/react-portal": "1.1.9", "@radix-ui/react-portal": "1.1.9",
"@radix-ui/react-primitive": "2.1.3", "@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-slot": "1.2.3", "@radix-ui/react-slot": "1.2.3",
@ -2916,13 +2966,19 @@
} }
} }
}, },
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/primitive": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
"integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
"license": "MIT"
},
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-dismissable-layer": { "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-dismissable-layer": {
"version": "1.1.10", "version": "1.1.11",
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.10.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz",
"integrity": "sha512-IM1zzRV4W3HtVgftdQiiOmA0AdJlCtMLe00FXaHwgt3rAnNsIyDqshvkIW3hj/iu5hu8ERP7KIYki6NkqDxAwQ==", "integrity": "sha512-Nqcp+t5cTB8BinFkZgXiMJniQH0PsUt2k51FUhbdfeKvc4ACcG2uQniY/8+h1Yv6Kza4Q7lD7PQV0z0oicE0Mg==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/primitive": "1.1.2", "@radix-ui/primitive": "1.1.3",
"@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-primitive": "2.1.3", "@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1", "@radix-ui/react-use-callback-ref": "1.1.1",
@ -2943,6 +2999,21 @@
} }
} }
}, },
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-focus-guards": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.3.tgz",
"integrity": "sha512-0rFg/Rj2Q62NCm62jZw0QX7a3sz6QCQU0LpZdNrJX8byRGaGVTqbrW9jAoIAHyMQqsNpeZ81YgSizOt5WXq0Pw==",
"license": "MIT",
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-focus-scope": { "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-focus-scope": {
"version": "1.1.7", "version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-focus-scope/-/react-focus-scope-1.1.7.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-scope/-/react-focus-scope-1.1.7.tgz",
@ -2968,38 +3039,6 @@
} }
} }
}, },
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-popper": {
"version": "1.2.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.7.tgz",
"integrity": "sha512-IUFAccz1JyKcf/RjB552PlWwxjeCJB8/4KxT7EhBHOJM+mN7LdW+B3kacJXILm32xawcMMjb2i0cIZpo+f9kiQ==",
"license": "MIT",
"dependencies": {
"@floating-ui/react-dom": "^2.0.0",
"@radix-ui/react-arrow": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-layout-effect": "1.1.1",
"@radix-ui/react-use-rect": "1.1.1",
"@radix-ui/react-use-size": "1.1.1",
"@radix-ui/rect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-portal": { "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-portal": {
"version": "1.1.9", "version": "1.1.9",
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz",
@ -3547,12 +3586,6 @@
"@sinonjs/commons": "^3.0.0" "@sinonjs/commons": "^3.0.0"
} }
}, },
"node_modules/@swc/counter": {
"version": "0.1.3",
"resolved": "https://registry.npmjs.org/@swc/counter/-/counter-0.1.3.tgz",
"integrity": "sha512-e2BR4lsJkkRlKZ/qCHPw9ZaSxc0MVUd7gtbtaB7aMvHeJVYe8sOB8DBZkP2DtISHGSku9sCK6T6cnY0CtXrOCQ==",
"license": "Apache-2.0"
},
"node_modules/@swc/helpers": { "node_modules/@swc/helpers": {
"version": "0.5.15", "version": "0.5.15",
"resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.15.tgz", "resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.15.tgz",
@ -3578,6 +3611,13 @@
"tailwindcss": "4.1.6" "tailwindcss": "4.1.6"
} }
}, },
"node_modules/@tailwindcss/node/node_modules/tailwindcss": {
"version": "4.1.6",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz",
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==",
"dev": true,
"license": "MIT"
},
"node_modules/@tailwindcss/oxide": { "node_modules/@tailwindcss/oxide": {
"version": "4.1.6", "version": "4.1.6",
"resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.6.tgz", "resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.6.tgz",
@ -3838,6 +3878,13 @@
"tailwindcss": "4.1.6" "tailwindcss": "4.1.6"
} }
}, },
"node_modules/@tailwindcss/postcss/node_modules/tailwindcss": {
"version": "4.1.6",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz",
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==",
"dev": true,
"license": "MIT"
},
"node_modules/@testing-library/dom": { "node_modules/@testing-library/dom": {
"version": "10.4.1", "version": "10.4.1",
"resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz", "resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz",
@ -5461,17 +5508,6 @@
"dev": true, "dev": true,
"license": "MIT" "license": "MIT"
}, },
"node_modules/busboy": {
"version": "1.6.0",
"resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz",
"integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==",
"dependencies": {
"streamsearch": "^1.1.0"
},
"engines": {
"node": ">=10.16.0"
}
},
"node_modules/bytes": { "node_modules/bytes": {
"version": "3.1.2", "version": "3.1.2",
"resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz",
@ -8281,9 +8317,9 @@
} }
}, },
"node_modules/is-arrayish": { "node_modules/is-arrayish": {
"version": "0.3.2", "version": "0.3.4",
"resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz", "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.4.tgz",
"integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==", "integrity": "sha512-m6UrgzFVUYawGBh1dUsWR5M2Clqic9RVXC/9f8ceNlv2IcO9j9J/z8UoCLPqtsPBFNzEpfR3xftohbfqDx8EQA==",
"license": "MIT", "license": "MIT",
"optional": true "optional": true
}, },
@ -11528,14 +11564,13 @@
} }
}, },
"node_modules/next": { "node_modules/next": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/next/-/next-15.3.3.tgz", "resolved": "https://registry.npmjs.org/next/-/next-15.5.3.tgz",
"integrity": "sha512-JqNj29hHNmCLtNvd090SyRbXJiivQ+58XjCcrC50Crb5g5u2zi7Y2YivbsEfzk6AtVI80akdOQbaMZwWB1Hthw==", "integrity": "sha512-r/liNAx16SQj4D+XH/oI1dlpv9tdKJ6cONYPwwcCC46f2NjpaRWY+EKCzULfgQYV6YKXjHBchff2IZBSlZmJNw==",
"license": "MIT",
"dependencies": { "dependencies": {
"@next/env": "15.3.3", "@next/env": "15.5.3",
"@swc/counter": "0.1.3",
"@swc/helpers": "0.5.15", "@swc/helpers": "0.5.15",
"busboy": "1.6.0",
"caniuse-lite": "^1.0.30001579", "caniuse-lite": "^1.0.30001579",
"postcss": "8.4.31", "postcss": "8.4.31",
"styled-jsx": "5.1.6" "styled-jsx": "5.1.6"
@ -11547,19 +11582,19 @@
"node": "^18.18.0 || ^19.8.0 || >= 20.0.0" "node": "^18.18.0 || ^19.8.0 || >= 20.0.0"
}, },
"optionalDependencies": { "optionalDependencies": {
"@next/swc-darwin-arm64": "15.3.3", "@next/swc-darwin-arm64": "15.5.3",
"@next/swc-darwin-x64": "15.3.3", "@next/swc-darwin-x64": "15.5.3",
"@next/swc-linux-arm64-gnu": "15.3.3", "@next/swc-linux-arm64-gnu": "15.5.3",
"@next/swc-linux-arm64-musl": "15.3.3", "@next/swc-linux-arm64-musl": "15.5.3",
"@next/swc-linux-x64-gnu": "15.3.3", "@next/swc-linux-x64-gnu": "15.5.3",
"@next/swc-linux-x64-musl": "15.3.3", "@next/swc-linux-x64-musl": "15.5.3",
"@next/swc-win32-arm64-msvc": "15.3.3", "@next/swc-win32-arm64-msvc": "15.5.3",
"@next/swc-win32-x64-msvc": "15.3.3", "@next/swc-win32-x64-msvc": "15.5.3",
"sharp": "^0.34.1" "sharp": "^0.34.3"
}, },
"peerDependencies": { "peerDependencies": {
"@opentelemetry/api": "^1.1.0", "@opentelemetry/api": "^1.1.0",
"@playwright/test": "^1.41.2", "@playwright/test": "^1.51.1",
"babel-plugin-react-compiler": "*", "babel-plugin-react-compiler": "*",
"react": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", "react": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0",
"react-dom": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", "react-dom": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0",
@ -13226,16 +13261,16 @@
"license": "ISC" "license": "ISC"
}, },
"node_modules/sharp": { "node_modules/sharp": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.1.tgz", "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.3.tgz",
"integrity": "sha512-1j0w61+eVxu7DawFJtnfYcvSv6qPFvfTaqzTQ2BLknVhHTwGS8sc63ZBF4rzkWMBVKybo4S5OBtDdZahh2A1xg==", "integrity": "sha512-eX2IQ6nFohW4DbvHIOLRB3MHFpYqaqvXd3Tp5e/T/dSH83fxaNJQRvDMhASmkNTsNTVF2/OOopzRCt7xokgPfg==",
"hasInstallScript": true, "hasInstallScript": true,
"license": "Apache-2.0", "license": "Apache-2.0",
"optional": true, "optional": true,
"dependencies": { "dependencies": {
"color": "^4.2.3", "color": "^4.2.3",
"detect-libc": "^2.0.3", "detect-libc": "^2.0.4",
"semver": "^7.7.1" "semver": "^7.7.2"
}, },
"engines": { "engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0" "node": "^18.17.0 || ^20.3.0 || >=21.0.0"
@ -13244,26 +13279,28 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-darwin-arm64": "0.34.1", "@img/sharp-darwin-arm64": "0.34.3",
"@img/sharp-darwin-x64": "0.34.1", "@img/sharp-darwin-x64": "0.34.3",
"@img/sharp-libvips-darwin-arm64": "1.1.0", "@img/sharp-libvips-darwin-arm64": "1.2.0",
"@img/sharp-libvips-darwin-x64": "1.1.0", "@img/sharp-libvips-darwin-x64": "1.2.0",
"@img/sharp-libvips-linux-arm": "1.1.0", "@img/sharp-libvips-linux-arm": "1.2.0",
"@img/sharp-libvips-linux-arm64": "1.1.0", "@img/sharp-libvips-linux-arm64": "1.2.0",
"@img/sharp-libvips-linux-ppc64": "1.1.0", "@img/sharp-libvips-linux-ppc64": "1.2.0",
"@img/sharp-libvips-linux-s390x": "1.1.0", "@img/sharp-libvips-linux-s390x": "1.2.0",
"@img/sharp-libvips-linux-x64": "1.1.0", "@img/sharp-libvips-linux-x64": "1.2.0",
"@img/sharp-libvips-linuxmusl-arm64": "1.1.0", "@img/sharp-libvips-linuxmusl-arm64": "1.2.0",
"@img/sharp-libvips-linuxmusl-x64": "1.1.0", "@img/sharp-libvips-linuxmusl-x64": "1.2.0",
"@img/sharp-linux-arm": "0.34.1", "@img/sharp-linux-arm": "0.34.3",
"@img/sharp-linux-arm64": "0.34.1", "@img/sharp-linux-arm64": "0.34.3",
"@img/sharp-linux-s390x": "0.34.1", "@img/sharp-linux-ppc64": "0.34.3",
"@img/sharp-linux-x64": "0.34.1", "@img/sharp-linux-s390x": "0.34.3",
"@img/sharp-linuxmusl-arm64": "0.34.1", "@img/sharp-linux-x64": "0.34.3",
"@img/sharp-linuxmusl-x64": "0.34.1", "@img/sharp-linuxmusl-arm64": "0.34.3",
"@img/sharp-wasm32": "0.34.1", "@img/sharp-linuxmusl-x64": "0.34.3",
"@img/sharp-win32-ia32": "0.34.1", "@img/sharp-wasm32": "0.34.3",
"@img/sharp-win32-x64": "0.34.1" "@img/sharp-win32-arm64": "0.34.3",
"@img/sharp-win32-ia32": "0.34.3",
"@img/sharp-win32-x64": "0.34.3"
} }
}, },
"node_modules/shebang-command": { "node_modules/shebang-command": {
@ -13389,9 +13426,9 @@
"license": "ISC" "license": "ISC"
}, },
"node_modules/simple-swizzle": { "node_modules/simple-swizzle": {
"version": "0.2.2", "version": "0.2.4",
"resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz", "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.4.tgz",
"integrity": "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==", "integrity": "sha512-nAu1WFPQSMNr2Zn9PGSZK9AGn4t/y97lEm+MXTtUDwfP0ksAIX4nO+6ruD9Jwut4C49SB1Ws+fbXsm/yScWOHw==",
"license": "MIT", "license": "MIT",
"optional": true, "optional": true,
"dependencies": { "dependencies": {
@ -13512,14 +13549,6 @@
"node": ">= 0.8" "node": ">= 0.8"
} }
}, },
"node_modules/streamsearch": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz",
"integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==",
"engines": {
"node": ">=10.0.0"
}
},
"node_modules/string-length": { "node_modules/string-length": {
"version": "4.0.2", "version": "4.0.2",
"resolved": "https://registry.npmjs.org/string-length/-/string-length-4.0.2.tgz", "resolved": "https://registry.npmjs.org/string-length/-/string-length-4.0.2.tgz",
@ -13843,9 +13872,9 @@
} }
}, },
"node_modules/tailwindcss": { "node_modules/tailwindcss": {
"version": "4.1.6", "version": "4.1.13",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz", "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.13.tgz",
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==", "integrity": "sha512-i+zidfmTqtwquj4hMEwdjshYYgMbOrPzb9a0M3ZgNa0JMoZeFC6bxZvO8yr8ozS6ix2SDz0+mvryPeBs2TFE+w==",
"dev": true, "dev": true,
"license": "MIT" "license": "MIT"
}, },

View file

@ -16,7 +16,7 @@
"@radix-ui/react-collapsible": "^1.1.12", "@radix-ui/react-collapsible": "^1.1.12",
"@radix-ui/react-dialog": "^1.1.13", "@radix-ui/react-dialog": "^1.1.13",
"@radix-ui/react-dropdown-menu": "^2.1.16", "@radix-ui/react-dropdown-menu": "^2.1.16",
"@radix-ui/react-select": "^2.2.5", "@radix-ui/react-select": "^2.2.6",
"@radix-ui/react-separator": "^1.1.7", "@radix-ui/react-separator": "^1.1.7",
"@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-slot": "^1.2.3",
"@radix-ui/react-tooltip": "^1.2.8", "@radix-ui/react-tooltip": "^1.2.8",
@ -25,7 +25,7 @@
"framer-motion": "^12.23.12", "framer-motion": "^12.23.12",
"llama-stack-client": "^0.2.21", "llama-stack-client": "^0.2.21",
"lucide-react": "^0.542.0", "lucide-react": "^0.542.0",
"next": "15.3.3", "next": "15.5.3",
"next-auth": "^4.24.11", "next-auth": "^4.24.11",
"next-themes": "^0.4.6", "next-themes": "^0.4.6",
"react": "^19.0.0", "react": "^19.0.0",

View file

@ -32,7 +32,7 @@ dependencies = [
"jinja2>=3.1.6", "jinja2>=3.1.6",
"jsonschema", "jsonschema",
"llama-stack-client>=0.2.21", "llama-stack-client>=0.2.21",
"openai>=1.99.6", "openai>=1.100.0", # for expires_after support
"prompt-toolkit", "prompt-toolkit",
"python-dotenv", "python-dotenv",
"python-jose[cryptography]", "python-jose[cryptography]",
@ -80,7 +80,6 @@ dev = [
unit = [ unit = [
"sqlite-vec", "sqlite-vec",
"ollama", "ollama",
"openai",
"aiosqlite", "aiosqlite",
"aiohttp", "aiohttp",
"psycopg2-binary>=2.9.0", "psycopg2-binary>=2.9.0",
@ -105,7 +104,6 @@ unit = [
# separately. If you are using "uv" to execute your tests, you can use the "--group" flag to specify extra # separately. If you are using "uv" to execute your tests, you can use the "--group" flag to specify extra
# dependencies. # dependencies.
test = [ test = [
"openai>=1.100.0", # for expires_after support
"aiosqlite", "aiosqlite",
"aiohttp", "aiohttp",
"torch>=2.6.0", "torch>=2.6.0",
@ -356,6 +354,7 @@ warn_required_dynamic_aliases = true
classmethod-decorators = ["classmethod", "pydantic.field_validator"] classmethod-decorators = ["classmethod", "pydantic.field_validator"]
[tool.pytest.ini_options] [tool.pytest.ini_options]
addopts = ["--durations=10"]
asyncio_mode = "auto" asyncio_mode = "auto"
markers = [ markers = [
"allow_network: Allow network access for specific unit tests", "allow_network: Allow network access for specific unit tests",

View file

@ -239,8 +239,9 @@ echo "Test pattern: ${TEST_PATTERN:-"(none)"}"
echo "" echo ""
# Prepare inputs for gh workflow run # Prepare inputs for gh workflow run
INPUTS=
if [[ -n "$TEST_SUBDIRS" ]]; then if [[ -n "$TEST_SUBDIRS" ]]; then
INPUTS="-f subdirs='$TEST_SUBDIRS'" INPUTS="$INPUTS -f subdirs='$TEST_SUBDIRS'"
fi fi
if [[ -n "$TEST_SETUP" ]]; then if [[ -n "$TEST_SETUP" ]]; then
INPUTS="$INPUTS -f test-setup='$TEST_SETUP'" INPUTS="$INPUTS -f test-setup='$TEST_SETUP'"

View file

@ -6,12 +6,25 @@
import time import time
import unicodedata
import pytest import pytest
from ..test_cases.test_case import TestCase from ..test_cases.test_case import TestCase
def _normalize_text(text: str) -> str:
"""
Normalize Unicode text by removing diacritical marks for comparison.
The test case streaming_01 expects the answer "Sol" for the question "What's the name of the Sun
in latin?", but the model is returning "sōl" (with a macron over the 'o'), which is the correct
Latin spelling. The test is failing because it's doing a simple case-insensitive string search
for "sol" but the actual response contains the diacritical mark.
"""
return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("ascii").lower()
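A quick illustration of the normalization helper added above: NFD decomposition splits off the combining macron so the ASCII encode/ignore round-trip can drop it, letting a plain substring check match (example strings are illustrative):

```python
import unicodedata


def _normalize_text(text: str) -> str:
    # Decompose accented characters (NFD), drop combining marks via the ASCII
    # encode/ignore round-trip, and lowercase for comparison.
    return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("ascii").lower()


print(_normalize_text("s\u014dl"))  # "sōl" -> "sol"
assert "sol" in _normalize_text("The Latin name is s\u014dl.")
```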
def provider_from_model(client_with_models, model_id): def provider_from_model(client_with_models, model_id):
models = {m.identifier: m for m in client_with_models.models.list()} models = {m.identifier: m for m in client_with_models.models.list()}
models.update({m.provider_resource_id: m for m in client_with_models.models.list()}) models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
@ -42,6 +55,10 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
"remote::groq", "remote::groq",
"remote::gemini", # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404 "remote::gemini", # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404
"remote::anthropic", # at least claude-3-{5,7}-{haiku,sonnet}-* / claude-{sonnet,opus}-4-* are not supported "remote::anthropic", # at least claude-3-{5,7}-{haiku,sonnet}-* / claude-{sonnet,opus}-4-* are not supported
"remote::azure", # {'error': {'code': 'OperationNotSupported', 'message': 'The completion operation
# does not work with the specified model, gpt-5-mini. Please choose different model and try
# again. You can learn more about which models can be used with each operation here:
# https://go.microsoft.com/fwlink/?linkid=2197993.'}}"}
): ):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.") pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
@ -157,7 +174,8 @@ def test_openai_completion_non_streaming_suffix(llama_stack_client, client_with_
assert len(response.choices) > 0 assert len(response.choices) > 0
choice = response.choices[0] choice = response.choices[0]
assert len(choice.text) > 5 assert len(choice.text) > 5
assert "france" in choice.text.lower() normalized_text = _normalize_text(choice.text)
assert "france" in normalized_text
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -248,7 +266,9 @@ def test_openai_chat_completion_non_streaming(compat_client, client_with_models,
) )
message_content = response.choices[0].message.content.lower().strip() message_content = response.choices[0].message.content.lower().strip()
assert len(message_content) > 0 assert len(message_content) > 0
assert expected.lower() in message_content normalized_expected = _normalize_text(expected)
normalized_content = _normalize_text(message_content)
assert normalized_expected in normalized_content
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -272,10 +292,13 @@ def test_openai_chat_completion_streaming(compat_client, client_with_models, tex
) )
streamed_content = [] streamed_content = []
for chunk in response: for chunk in response:
if chunk.choices[0].delta.content: # On some providers like Azure, the choices are empty on the first chunk, so we need to check for that
if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content:
streamed_content.append(chunk.choices[0].delta.content.lower().strip()) streamed_content.append(chunk.choices[0].delta.content.lower().strip())
assert len(streamed_content) > 0 assert len(streamed_content) > 0
assert expected.lower() in "".join(streamed_content) normalized_expected = _normalize_text(expected)
normalized_content = _normalize_text("".join(streamed_content))
assert normalized_expected in normalized_content
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -308,8 +331,12 @@ def test_openai_chat_completion_streaming_with_n(compat_client, client_with_mode
streamed_content.get(choice.index, "") + choice.delta.content.lower().strip() streamed_content.get(choice.index, "") + choice.delta.content.lower().strip()
) )
assert len(streamed_content) == 2 assert len(streamed_content) == 2
normalized_expected = _normalize_text(expected)
for i, content in streamed_content.items(): for i, content in streamed_content.items():
assert expected.lower() in content, f"Choice {i}: Expected {expected.lower()} in {content}" normalized_content = _normalize_text(content)
assert normalized_expected in normalized_content, (
f"Choice {i}: Expected {normalized_expected} in {normalized_content}"
)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -339,9 +366,9 @@ def test_inference_store(compat_client, client_with_models, text_model_id, strea
content = "" content = ""
response_id = None response_id = None
for chunk in response: for chunk in response:
if response_id is None: if response_id is None and chunk.id:
response_id = chunk.id response_id = chunk.id
if chunk.choices[0].delta.content: if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content:
content += chunk.choices[0].delta.content content += chunk.choices[0].delta.content
else: else:
response_id = response.id response_id = response.id
@ -410,8 +437,9 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode
content = "" content = ""
response_id = None response_id = None
for chunk in response: for chunk in response:
if response_id is None: if response_id is None and chunk.id:
response_id = chunk.id response_id = chunk.id
if chunk.choices and len(chunk.choices) > 0:
if delta := chunk.choices[0].delta: if delta := chunk.choices[0].delta:
if delta.content: if delta.content:
content += delta.content content += delta.content
@ -484,4 +512,5 @@ def test_openai_chat_completion_non_streaming_with_file(openai_client, client_wi
stream=False, stream=False,
) )
message_content = response.choices[0].message.content.lower().strip() message_content = response.choices[0].message.content.lower().strip()
assert "hello world" in message_content normalized_content = _normalize_text(message_content)
assert "hello world" in normalized_content

View file

@ -32,6 +32,7 @@ def skip_if_model_doesnt_support_completion(client_with_models, model_id):
"remote::vertexai", "remote::vertexai",
"remote::groq", "remote::groq",
"remote::sambanova", "remote::sambanova",
"remote::azure",
) )
or "openai-compat" in provider.provider_type or "openai-compat" in provider.provider_type
): ):
@ -44,7 +45,7 @@ def skip_if_model_doesnt_support_json_schema_structured_output(client_with_model
provider_id = models[model_id].provider_id provider_id = models[model_id].provider_id
providers = {p.provider_id: p for p in client_with_models.providers.list()} providers = {p.provider_id: p for p in client_with_models.providers.list()}
provider = providers[provider_id] provider = providers[provider_id]
if provider.provider_type in ("remote::sambanova",): if provider.provider_type in ("remote::sambanova", "remote::azure"):
pytest.skip( pytest.skip(
f"Model {model_id} hosted by {provider.provider_type} doesn't support json_schema structured output" f"Model {model_id} hosted by {provider.provider_type} doesn't support json_schema structured output"
) )

View file

@ -0,0 +1,71 @@
{
"request": {
"method": "POST",
"url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-5-mini",
"messages": [
{
"role": "user",
"content": "Which planet do humans live on?"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "gpt-5-mini"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-CECIXqfvjuluKkZtG3q2QJoSQhBU0",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Humans live on Earth \u2014 the third planet from the Sun. It's the only known planet that naturally supports life, with a breathable atmosphere, liquid water, and temperatures suitable for living organisms.",
"refusal": null,
"role": "assistant",
"annotations": [],
"audio": null,
"function_call": null,
"tool_calls": null
},
"content_filter_results": {}
}
],
"created": 1757499901,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 112,
"prompt_tokens": 13,
"total_tokens": 125,
"completion_tokens_details": {
"accepted_prediction_tokens": 0,
"audio_tokens": 0,
"reasoning_tokens": 64,
"rejected_prediction_tokens": 0
},
"prompt_tokens_details": {
"audio_tokens": 0,
"cached_tokens": 0
}
},
"prompt_filter_results": [
{
"prompt_index": 0,
"content_filter_results": {}
}
]
}
},
"is_streaming": false
}
}

View file

@ -0,0 +1,448 @@
{
"request": {
"method": "POST",
"url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-5-mini",
"messages": [
{
"role": "user",
"content": "Hello, world!"
}
],
"stream": true
},
"endpoint": "/v1/chat/completions",
"model": "gpt-5-mini"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "",
"choices": [],
"created": 0,
"model": "",
"object": "",
"service_tier": null,
"system_fingerprint": null,
"usage": null,
"prompt_filter_results": [
{
"prompt_index": 0,
"content_filter_results": {}
}
]
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": "Hello",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": ",",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " world",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": "!",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " Hi",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " \u2014",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " how",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " can",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " I",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " help",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " you",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " today",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": "?",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
}
],
"is_streaming": true
}
}

View file

@@ -0,0 +1,71 @@
{
"request": {
"method": "POST",
"url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-5-mini",
"messages": [
{
"role": "user",
"content": "Which planet has rings around it with a name starting with letter S?"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "gpt-5-mini"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-CECIkT5cbqFazpungtewksVePcUNa",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Saturn. It's the planet famous for its prominent ring system made of ice and rock.",
"refusal": null,
"role": "assistant",
"annotations": [],
"audio": null,
"function_call": null,
"tool_calls": null
},
"content_filter_results": {}
}
],
"created": 1757499914,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 156,
"prompt_tokens": 20,
"total_tokens": 176,
"completion_tokens_details": {
"accepted_prediction_tokens": 0,
"audio_tokens": 0,
"reasoning_tokens": 128,
"rejected_prediction_tokens": 0
},
"prompt_tokens_details": {
"audio_tokens": 0,
"cached_tokens": 0
}
},
"prompt_filter_results": [
{
"prompt_index": 0,
"content_filter_results": {}
}
]
}
},
"is_streaming": false
}
}

View file

@@ -0,0 +1,71 @@
{
"request": {
"method": "POST",
"url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-5-mini",
"messages": [
{
"role": "user",
"content": "Hello, world!"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "gpt-5-mini"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-CECIuyylsMNXspa83k8LrD8SQadNY",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Hello! \ud83d\udc4b How can I help you today \u2014 answer a question, write or edit something, debug code, brainstorm ideas, or anything else?",
"refusal": null,
"role": "assistant",
"annotations": [],
"audio": null,
"function_call": null,
"tool_calls": null
},
"content_filter_results": {}
}
],
"created": 1757499924,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 40,
"prompt_tokens": 10,
"total_tokens": 50,
"completion_tokens_details": {
"accepted_prediction_tokens": 0,
"audio_tokens": 0,
"reasoning_tokens": 0,
"rejected_prediction_tokens": 0
},
"prompt_tokens_details": {
"audio_tokens": 0,
"cached_tokens": 0
}
},
"prompt_filter_results": [
{
"prompt_index": 0,
"content_filter_results": {}
}
]
}
},
"is_streaming": false
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -0,0 +1,98 @@
{
"request": {
"method": "POST",
"url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-5-mini",
"messages": [
{
"role": "user",
"content": "What's the weather in Tokyo? Use the get_weather function to get the weather."
}
],
"stream": false,
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the weather in a given city",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to get the weather for"
}
}
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "gpt-5-mini"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-CECIwq9Odd0mOJMmw7ytv8iEazH4H",
"choices": [
{
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null,
"message": {
"content": null,
"refusal": null,
"role": "assistant",
"annotations": [],
"audio": null,
"function_call": null,
"tool_calls": [
{
"id": "call_yw18spRc1jjUlEyabbXBhB33",
"function": {
"arguments": "{\"city\":\"Tokyo\"}",
"name": "get_weather"
},
"type": "function"
}
]
},
"content_filter_results": {}
}
],
"created": 1757499926,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 88,
"prompt_tokens": 151,
"total_tokens": 239,
"completion_tokens_details": {
"accepted_prediction_tokens": 0,
"audio_tokens": 0,
"reasoning_tokens": 64,
"rejected_prediction_tokens": 0
},
"prompt_tokens_details": {
"audio_tokens": 0,
"cached_tokens": 0
}
},
"prompt_filter_results": [
{
"prompt_index": 0,
"content_filter_results": {}
}
]
}
},
"is_streaming": false
}
}

File diff suppressed because it is too large

View file

@@ -0,0 +1,310 @@
{
"request": {
"method": "POST",
"url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-5-mini",
"messages": [
{
"role": "user",
"content": "What's the weather in Tokyo? Use the get_weather function to get the weather."
}
],
"stream": true,
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the weather in a given city",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to get the weather for"
}
}
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "gpt-5-mini"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "",
"choices": [],
"created": 0,
"model": "",
"object": "",
"service_tier": null,
"system_fingerprint": null,
"usage": null,
"prompt_filter_results": [
{
"prompt_index": 0,
"content_filter_results": {}
}
]
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": [
{
"index": 0,
"id": "call_TMbEoYn9q0ZKtoxav5LpD9Ts",
"function": {
"arguments": "",
"name": "get_weather"
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499912,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "{\"",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499912,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "city",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499912,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "\":\"",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499912,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "Tokyo",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499912,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "\"}",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499912,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499912,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
}
],
"is_streaming": true
}
}
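In the streaming recording above, the tool-call arguments arrive as JSON fragments spread across several chunks and only become valid JSON once concatenated. A minimal sketch of how a consumer of these recordings could reassemble them, assuming chunks is the list of parsed __data__ dicts from response.body:

import json


def assemble_tool_calls(chunks: list[dict]) -> dict[str, dict]:
    """Concatenate streamed tool-call argument fragments per call, then parse the JSON."""
    buffers: dict[int, dict] = {}  # tool-call index -> {"name": ..., "arguments": ...}
    for chunk in chunks:
        for choice in chunk.get("choices", []):
            for tool_call in choice.get("delta", {}).get("tool_calls") or []:
                buf = buffers.setdefault(tool_call["index"], {"name": "", "arguments": ""})
                fn = tool_call.get("function") or {}
                if fn.get("name"):
                    buf["name"] = fn["name"]
                if fn.get("arguments"):
                    buf["arguments"] += fn["arguments"]
    return {buf["name"]: json.loads(buf["arguments"]) for buf in buffers.values()}


# For the recording above this yields {"get_weather": {"city": "Tokyo"}}.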

View file

@@ -0,0 +1,556 @@
{
"request": {
"method": "POST",
"url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-5-mini",
"messages": [
{
"role": "user",
"content": "What is the name of the US captial?"
}
],
"stream": true
},
"endpoint": "/v1/chat/completions",
"model": "gpt-5-mini"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "",
"choices": [],
"created": 0,
"model": "",
"object": "",
"service_tier": null,
"system_fingerprint": null,
"usage": null,
"prompt_filter_results": [
{
"prompt_index": 0,
"content_filter_results": {}
}
]
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": "The",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " capital",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " of",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " the",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " United",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " States",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " is",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " Washington",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": ",",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " D",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": ".C",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": ".",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " (",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": "District",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " of",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " Columbia",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": ").",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
}
],
"is_streaming": true
}
}

View file

@@ -49,16 +49,13 @@ def setup_openai_telemetry_data(llama_stack_client, text_model_id):
traces = llama_stack_client.telemetry.query_traces(limit=10) traces = llama_stack_client.telemetry.query_traces(limit=10)
if len(traces) >= 5: # 5 OpenAI completion traces if len(traces) >= 5: # 5 OpenAI completion traces
break break
time.sleep(1) time.sleep(0.1)
if len(traces) < 5: if len(traces) < 5:
pytest.fail( pytest.fail(
f"Failed to create sufficient OpenAI completion telemetry data after 30s. Got {len(traces)} traces." f"Failed to create sufficient OpenAI completion telemetry data after 30s. Got {len(traces)} traces."
) )
# Wait for 5 seconds to ensure traces has completed logging
time.sleep(5)
yield yield
@@ -185,11 +182,13 @@ def test_openai_completion_creates_telemetry(llama_stack_client, text_model_id):
assert len(response.choices) > 0, "Response should have at least one choice" assert len(response.choices) > 0, "Response should have at least one choice"
# Wait for telemetry to be recorded # Wait for telemetry to be recorded
time.sleep(3) start_time = time.time()
while time.time() - start_time < 30:
# Check that we have more traces now
final_traces = llama_stack_client.telemetry.query_traces(limit=20) final_traces = llama_stack_client.telemetry.query_traces(limit=20)
final_count = len(final_traces) final_count = len(final_traces)
if final_count > initial_count:
break
time.sleep(0.1)
# Should have at least as many traces as before (might have more due to other activity) # Should have at least as many traces as before (might have more due to other activity)
assert final_count >= initial_count, "Should have at least as many traces after OpenAI call" assert final_count >= initial_count, "Should have at least as many traces after OpenAI call"
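The hunk above replaces fixed sleeps with a poll loop bounded by a timeout. A small helper capturing that pattern, offered only as a sketch; the tests in this commit inline the loop rather than calling a helper like this:

import time


def wait_for(condition, timeout: float = 30.0, interval: float = 0.1):
    """Poll condition() until it returns a truthy value or the timeout elapses."""
    start = time.time()
    while time.time() - start < timeout:
        result = condition()
        if result:
            return result
        time.sleep(interval)
    raise TimeoutError(f"condition not met within {timeout}s")


# e.g. wait_for(lambda: len(llama_stack_client.telemetry.query_traces(limit=10)) >= 5)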

View file

@@ -42,14 +42,11 @@ def setup_telemetry_data(llama_stack_client, text_model_id):
traces = llama_stack_client.telemetry.query_traces(limit=10) traces = llama_stack_client.telemetry.query_traces(limit=10)
if len(traces) >= 4: if len(traces) >= 4:
break break
time.sleep(1) time.sleep(0.1)
if len(traces) < 4: if len(traces) < 4:
pytest.fail(f"Failed to create sufficient telemetry data after 30s. Got {len(traces)} traces.") pytest.fail(f"Failed to create sufficient telemetry data after 30s. Got {len(traces)} traces.")
# Wait for 5 seconds to ensure traces has completed logging
time.sleep(5)
yield yield

View file

@@ -46,10 +46,7 @@ def setup_telemetry_metrics_data(openai_client, client_with_models, text_model_i
break break
except Exception: except Exception:
pass pass
time.sleep(1) time.sleep(0.1)
# Wait additional time to ensure all metrics are processed
time.sleep(5)
# Return the token lists for use in tests # Return the token lists for use in tests
return {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, "total_tokens": total_tokens} return {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, "total_tokens": total_tokens}

View file

@@ -183,6 +183,110 @@ def test_vector_db_insert_from_url_and_query(
assert any("llama2" in chunk.content.lower() for chunk in response2.chunks) assert any("llama2" in chunk.content.lower() for chunk in response2.chunks)
def test_rag_tool_openai_apis(client_with_empty_registry, embedding_model_id, embedding_dimension):
vector_db_id = "test_openai_vector_db"
client_with_empty_registry.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
)
available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()]
actual_vector_db_id = available_vector_dbs[0]
# different document formats that should work with OpenAI APIs
documents = [
Document(
document_id="text-doc",
content="This is a plain text document about machine learning algorithms.",
metadata={"type": "text", "category": "AI"},
),
Document(
document_id="url-doc",
content="https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/chat.rst",
mime_type="text/plain",
metadata={"type": "url", "source": "pytorch"},
),
Document(
document_id="data-url-doc",
content="data:text/plain;base64,VGhpcyBpcyBhIGRhdGEgVVJMIGRvY3VtZW50IGFib3V0IGRlZXAgbGVhcm5pbmcu", # "This is a data URL document about deep learning."
metadata={"type": "data_url", "encoding": "base64"},
),
]
client_with_empty_registry.tool_runtime.rag_tool.insert(
documents=documents,
vector_db_id=actual_vector_db_id,
chunk_size_in_tokens=256,
)
files_list = client_with_empty_registry.files.list()
assert len(files_list.data) >= len(documents), (
f"Expected at least {len(documents)} files, got {len(files_list.data)}"
)
vector_store_files = client_with_empty_registry.vector_io.openai_list_files_in_vector_store(
vector_store_id=actual_vector_db_id
)
assert len(vector_store_files.data) >= len(documents), f"Expected at least {len(documents)} files in vector store"
response = client_with_empty_registry.tool_runtime.rag_tool.query(
vector_db_ids=[actual_vector_db_id],
content="Tell me about machine learning and deep learning",
)
assert_valid_text_response(response)
content_text = " ".join([chunk.text for chunk in response.content]).lower()
assert "machine learning" in content_text or "deep learning" in content_text
def test_rag_tool_exception_handling(client_with_empty_registry, embedding_model_id, embedding_dimension):
vector_db_id = "test_exception_handling"
client_with_empty_registry.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
)
available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()]
actual_vector_db_id = available_vector_dbs[0]
documents = [
Document(
document_id="valid-doc",
content="This is a valid document that should be processed successfully.",
metadata={"status": "valid"},
),
Document(
document_id="invalid-url-doc",
content="https://nonexistent-domain-12345.com/invalid.txt",
metadata={"status": "invalid_url"},
),
Document(
document_id="another-valid-doc",
content="This is another valid document for testing resilience.",
metadata={"status": "valid"},
),
]
client_with_empty_registry.tool_runtime.rag_tool.insert(
documents=documents,
vector_db_id=actual_vector_db_id,
chunk_size_in_tokens=256,
)
response = client_with_empty_registry.tool_runtime.rag_tool.query(
vector_db_ids=[actual_vector_db_id],
content="valid document",
)
assert_valid_text_response(response)
content_text = " ".join([chunk.text for chunk in response.content]).lower()
assert "valid document" in content_text
def test_rag_tool_insert_and_query(client_with_empty_registry, embedding_model_id, embedding_dimension): def test_rag_tool_insert_and_query(client_with_empty_registry, embedding_model_id, embedding_dimension):
providers = [p for p in client_with_empty_registry.providers.list() if p.api == "vector_io"] providers = [p for p in client_with_empty_registry.providers.list() if p.api == "vector_io"]
assert len(providers) > 0 assert len(providers) > 0
@@ -249,3 +353,107 @@ def test_rag_tool_insert_and_query(client_with_empty_registry, embedding_model_i
"chunk_template": "This should raise a ValueError because it is missing the proper template variables", "chunk_template": "This should raise a ValueError because it is missing the proper template variables",
}, },
) )
def test_rag_tool_query_generation(client_with_empty_registry, embedding_model_id, embedding_dimension):
vector_db_id = "test_query_generation_db"
client_with_empty_registry.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
)
available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()]
actual_vector_db_id = available_vector_dbs[0]
documents = [
Document(
document_id="ai-doc",
content="Artificial intelligence and machine learning are transforming technology.",
metadata={"category": "AI"},
),
Document(
document_id="banana-doc",
content="Don't bring a banana to a knife fight.",
metadata={"category": "wisdom"},
),
]
client_with_empty_registry.tool_runtime.rag_tool.insert(
documents=documents,
vector_db_id=actual_vector_db_id,
chunk_size_in_tokens=256,
)
response = client_with_empty_registry.tool_runtime.rag_tool.query(
vector_db_ids=[actual_vector_db_id],
content="Tell me about AI",
)
assert_valid_text_response(response)
content_text = " ".join([chunk.text for chunk in response.content]).lower()
assert "artificial intelligence" in content_text or "machine learning" in content_text
def test_rag_tool_pdf_data_url_handling(client_with_empty_registry, embedding_model_id, embedding_dimension):
vector_db_id = "test_pdf_data_url_db"
client_with_empty_registry.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
)
available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()]
actual_vector_db_id = available_vector_dbs[0]
sample_pdf = b"%PDF-1.3\n3 0 obj\n<</Type /Page\n/Parent 1 0 R\n/Resources 2 0 R\n/Contents 4 0 R>>\nendobj\n4 0 obj\n<</Filter /FlateDecode /Length 115>>\nstream\nx\x9c\x15\xcc1\x0e\x820\x18@\xe1\x9dS\xbcM]jk$\xd5\xd5(\x83!\x86\xa1\x17\xf8\xa3\xa5`LIh+\xd7W\xc6\xf7\r\xef\xc0\xbd\xd2\xaa\xb6,\xd5\xc5\xb1o\x0c\xa6VZ\xe3znn%\xf3o\xab\xb1\xe7\xa3:Y\xdc\x8bm\xeb\xf3&1\xc8\xd7\xd3\x97\xc82\xe6\x81\x87\xe42\xcb\x87Vb(\x12<\xdd<=}Jc\x0cL\x91\xee\xda$\xb5\xc3\xbd\xd7\xe9\x0f\x8d\x97 $\nendstream\nendobj\n1 0 obj\n<</Type /Pages\n/Kids [3 0 R ]\n/Count 1\n/MediaBox [0 0 595.28 841.89]\n>>\nendobj\n5 0 obj\n<</Type /Font\n/BaseFont /Helvetica\n/Subtype /Type1\n/Encoding /WinAnsiEncoding\n>>\nendobj\n2 0 obj\n<<\n/ProcSet [/PDF /Text /ImageB /ImageC /ImageI]\n/Font <<\n/F1 5 0 R\n>>\n/XObject <<\n>>\n>>\nendobj\n6 0 obj\n<<\n/Producer (PyFPDF 1.7.2 http://pyfpdf.googlecode.com/)\n/Title (This is a sample title.)\n/Author (Llama Stack Developers)\n/CreationDate (D:20250312165548)\n>>\nendobj\n7 0 obj\n<<\n/Type /Catalog\n/Pages 1 0 R\n/OpenAction [3 0 R /FitH null]\n/PageLayout /OneColumn\n>>\nendobj\nxref\n0 8\n0000000000 65535 f \n0000000272 00000 n \n0000000455 00000 n \n0000000009 00000 n \n0000000087 00000 n \n0000000359 00000 n \n0000000559 00000 n \n0000000734 00000 n \ntrailer\n<<\n/Size 8\n/Root 7 0 R\n/Info 6 0 R\n>>\nstartxref\n837\n%%EOF\n"
import base64
pdf_base64 = base64.b64encode(sample_pdf).decode("utf-8")
pdf_data_url = f"data:application/pdf;base64,{pdf_base64}"
documents = [
Document(
document_id="test-pdf-data-url",
content=pdf_data_url,
metadata={"type": "pdf", "source": "data_url"},
),
]
client_with_empty_registry.tool_runtime.rag_tool.insert(
documents=documents,
vector_db_id=actual_vector_db_id,
chunk_size_in_tokens=256,
)
files_list = client_with_empty_registry.files.list()
assert len(files_list.data) >= 1, "PDF should have been uploaded to Files API"
pdf_file = None
for file in files_list.data:
if file.filename and "test-pdf-data-url" in file.filename:
pdf_file = file
break
assert pdf_file is not None, "PDF file should be found in Files API"
assert pdf_file.bytes == len(sample_pdf), f"File size should match original PDF ({len(sample_pdf)} bytes)"
file_content = client_with_empty_registry.files.retrieve_content(pdf_file.id)
assert file_content.startswith(b"%PDF-"), "Retrieved file should be a valid PDF"
vector_store_files = client_with_empty_registry.vector_io.openai_list_files_in_vector_store(
vector_store_id=actual_vector_db_id
)
assert len(vector_store_files.data) >= 1, "PDF should be attached to vector store"
response = client_with_empty_registry.tool_runtime.rag_tool.query(
vector_db_ids=[actual_vector_db_id],
content="sample title",
)
assert_valid_text_response(response)
content_text = " ".join([chunk.text for chunk in response.content]).lower()
assert "sample title" in content_text or "title" in content_text
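Both new tests above feed documents in as base64 data URLs, plain text in one case and a generated PDF in the other. A minimal sketch of building such a data URL from raw bytes and a MIME type:

import base64


def to_data_url(payload: bytes, mime_type: str) -> str:
    """Encode raw bytes as a base64 data URL, as the RAG tool tests above do."""
    encoded = base64.b64encode(payload).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"


# e.g. to_data_url(b"This is a data URL document about deep learning.", "text/plain")
# e.g. to_data_url(sample_pdf, "application/pdf")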

View file

@@ -6,16 +6,18 @@
import tempfile import tempfile
from pathlib import Path from pathlib import Path
from unittest.mock import patch from unittest.mock import AsyncMock, Mock, patch
import pytest import pytest
from openai import AsyncOpenAI from openai import NOT_GIVEN, AsyncOpenAI
from openai.types.model import Model as OpenAIModel
# Import the real Pydantic response types instead of using Mocks # Import the real Pydantic response types instead of using Mocks
from llama_stack.apis.inference import ( from llama_stack.apis.inference import (
OpenAIAssistantMessageParam, OpenAIAssistantMessageParam,
OpenAIChatCompletion, OpenAIChatCompletion,
OpenAIChoice, OpenAIChoice,
OpenAICompletion,
OpenAIEmbeddingData, OpenAIEmbeddingData,
OpenAIEmbeddingsResponse, OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage, OpenAIEmbeddingUsage,
@@ -153,24 +155,22 @@ class TestInferenceRecording:
async def test_recording_mode(self, temp_storage_dir, real_openai_chat_response): async def test_recording_mode(self, temp_storage_dir, real_openai_chat_response):
"""Test that recording mode captures and stores responses.""" """Test that recording mode captures and stores responses."""
async def mock_create(*args, **kwargs):
return real_openai_chat_response
temp_storage_dir = temp_storage_dir / "test_recording_mode" temp_storage_dir = temp_storage_dir / "test_recording_mode"
with patch("openai.resources.chat.completions.AsyncCompletions.create", side_effect=mock_create):
with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.chat.completions._post = AsyncMock(return_value=real_openai_chat_response)
response = await client.chat.completions.create( response = await client.chat.completions.create(
model="llama3.2:3b", model="llama3.2:3b",
messages=[{"role": "user", "content": "Hello, how are you?"}], messages=[{"role": "user", "content": "Hello, how are you?"}],
temperature=0.7, temperature=0.7,
max_tokens=50, max_tokens=50,
user=NOT_GIVEN,
) )
# Verify the response was returned correctly # Verify the response was returned correctly
assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking." assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking."
client.chat.completions._post.assert_called_once()
# Verify recording was stored # Verify recording was stored
storage = ResponseStorage(temp_storage_dir) storage = ResponseStorage(temp_storage_dir)
@@ -178,27 +178,25 @@ class TestInferenceRecording:
async def test_replay_mode(self, temp_storage_dir, real_openai_chat_response): async def test_replay_mode(self, temp_storage_dir, real_openai_chat_response):
"""Test that replay mode returns stored responses without making real calls.""" """Test that replay mode returns stored responses without making real calls."""
async def mock_create(*args, **kwargs):
return real_openai_chat_response
temp_storage_dir = temp_storage_dir / "test_replay_mode" temp_storage_dir = temp_storage_dir / "test_replay_mode"
# First, record a response # First, record a response
with patch("openai.resources.chat.completions.AsyncCompletions.create", side_effect=mock_create):
with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.chat.completions._post = AsyncMock(return_value=real_openai_chat_response)
response = await client.chat.completions.create( response = await client.chat.completions.create(
model="llama3.2:3b", model="llama3.2:3b",
messages=[{"role": "user", "content": "Hello, how are you?"}], messages=[{"role": "user", "content": "Hello, how are you?"}],
temperature=0.7, temperature=0.7,
max_tokens=50, max_tokens=50,
user=NOT_GIVEN,
) )
client.chat.completions._post.assert_called_once()
# Now test replay mode - should not call the original method # Now test replay mode - should not call the original method
with patch("openai.resources.chat.completions.AsyncCompletions.create") as mock_create_patch:
with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.chat.completions._post = AsyncMock(return_value=real_openai_chat_response)
response = await client.chat.completions.create( response = await client.chat.completions.create(
model="llama3.2:3b", model="llama3.2:3b",
@@ -211,7 +209,43 @@ class TestInferenceRecording:
assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking." assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking."
# Verify the original method was NOT called # Verify the original method was NOT called
mock_create_patch.assert_not_called() client.chat.completions._post.assert_not_called()
async def test_replay_mode_models(self, temp_storage_dir):
"""Test that replay mode returns stored responses without making real model listing calls."""
async def _async_iterator(models):
for model in models:
yield model
models = [
OpenAIModel(id="foo", created=1, object="model", owned_by="test"),
OpenAIModel(id="bar", created=2, object="model", owned_by="test"),
]
expected_ids = {m.id for m in models}
temp_storage_dir = temp_storage_dir / "test_replay_mode_models"
# baseline - mock works without recording
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.models._get_api_list = Mock(return_value=_async_iterator(models))
assert {m.id async for m in client.models.list()} == expected_ids
client.models._get_api_list.assert_called_once()
# record the call
with inference_recording(mode=InferenceMode.RECORD, storage_dir=temp_storage_dir):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.models._get_api_list = Mock(return_value=_async_iterator(models))
assert {m.id async for m in client.models.list()} == expected_ids
client.models._get_api_list.assert_called_once()
# replay the call
with inference_recording(mode=InferenceMode.REPLAY, storage_dir=temp_storage_dir):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.models._get_api_list = Mock(return_value=_async_iterator(models))
assert {m.id async for m in client.models.list()} == expected_ids
client.models._get_api_list.assert_not_called()
async def test_replay_missing_recording(self, temp_storage_dir): async def test_replay_missing_recording(self, temp_storage_dir):
"""Test that replay mode fails when no recording is found.""" """Test that replay mode fails when no recording is found."""
@@ -228,28 +262,42 @@ class TestInferenceRecording:
async def test_embeddings_recording(self, temp_storage_dir, real_embeddings_response): async def test_embeddings_recording(self, temp_storage_dir, real_embeddings_response):
"""Test recording and replay of embeddings calls.""" """Test recording and replay of embeddings calls."""
async def mock_create(*args, **kwargs): # baseline - mock works without recording
return real_embeddings_response client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.embeddings._post = AsyncMock(return_value=real_embeddings_response)
response = await client.embeddings.create(
model=real_embeddings_response.model,
input=["Hello world", "Test embedding"],
encoding_format=NOT_GIVEN,
)
assert len(response.data) == 2
assert response.data[0].embedding == [0.1, 0.2, 0.3]
client.embeddings._post.assert_called_once()
temp_storage_dir = temp_storage_dir / "test_embeddings_recording" temp_storage_dir = temp_storage_dir / "test_embeddings_recording"
# Record # Record
with patch("openai.resources.embeddings.AsyncEmbeddings.create", side_effect=mock_create):
with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.embeddings._post = AsyncMock(return_value=real_embeddings_response)
response = await client.embeddings.create( response = await client.embeddings.create(
model="nomic-embed-text", input=["Hello world", "Test embedding"] model=real_embeddings_response.model,
input=["Hello world", "Test embedding"],
encoding_format=NOT_GIVEN,
dimensions=NOT_GIVEN,
user=NOT_GIVEN,
) )
assert len(response.data) == 2 assert len(response.data) == 2
# Replay # Replay
with patch("openai.resources.embeddings.AsyncEmbeddings.create") as mock_create_patch:
with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.embeddings._post = AsyncMock(return_value=real_embeddings_response)
response = await client.embeddings.create( response = await client.embeddings.create(
model="nomic-embed-text", input=["Hello world", "Test embedding"] model=real_embeddings_response.model,
input=["Hello world", "Test embedding"],
) )
# Verify we got the recorded response # Verify we got the recorded response
@@ -257,7 +305,67 @@ class TestInferenceRecording:
assert response.data[0].embedding == [0.1, 0.2, 0.3] assert response.data[0].embedding == [0.1, 0.2, 0.3]
# Verify original method was not called # Verify original method was not called
mock_create_patch.assert_not_called() client.embeddings._post.assert_not_called()
async def test_completions_recording(self, temp_storage_dir):
real_completions_response = OpenAICompletion(
id="test_completion",
object="text_completion",
created=1234567890,
model="llama3.2:3b",
choices=[
{
"text": "Hello! I'm doing well, thank you for asking.",
"index": 0,
"logprobs": None,
"finish_reason": "stop",
}
],
)
temp_storage_dir = temp_storage_dir / "test_completions_recording"
# baseline - mock works without recording
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.completions._post = AsyncMock(return_value=real_completions_response)
response = await client.completions.create(
model=real_completions_response.model,
prompt="Hello, how are you?",
temperature=0.7,
max_tokens=50,
user=NOT_GIVEN,
)
assert response.choices[0].text == real_completions_response.choices[0].text
client.completions._post.assert_called_once()
# Record
with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.completions._post = AsyncMock(return_value=real_completions_response)
response = await client.completions.create(
model=real_completions_response.model,
prompt="Hello, how are you?",
temperature=0.7,
max_tokens=50,
user=NOT_GIVEN,
)
assert response.choices[0].text == real_completions_response.choices[0].text
client.completions._post.assert_called_once()
# Replay
with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.completions._post = AsyncMock(return_value=real_completions_response)
response = await client.completions.create(
model=real_completions_response.model,
prompt="Hello, how are you?",
temperature=0.7,
max_tokens=50,
)
assert response.choices[0].text == real_completions_response.choices[0].text
client.completions._post.assert_not_called()
async def test_live_mode(self, real_openai_chat_response): async def test_live_mode(self, real_openai_chat_response):
"""Test that live mode passes through to original methods.""" """Test that live mode passes through to original methods."""

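The rewritten tests above stop patching openai.resources.*.create and instead replace each resource's private _post transport method with an AsyncMock, so the public create() path, and any recording wrapper installed around it, still executes. A condensed sketch of that pattern, assuming _post remains the single transport entry point for these resource classes:

from unittest.mock import AsyncMock

from openai import AsyncOpenAI


async def call_with_stubbed_transport(canned_response):
    # Only the HTTP transport layer (_post) is stubbed; no network call is made,
    # while the public create() code path runs normally and can be recorded.
    client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
    client.chat.completions._post = AsyncMock(return_value=canned_response)
    response = await client.chat.completions.create(
        model="llama3.2:3b",
        messages=[{"role": "user", "content": "Hello, how are you?"}],
    )
    client.chat.completions._post.assert_called_once()
    return response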
View file

@@ -6,19 +6,15 @@
import asyncio import asyncio
import json import json
import logging # allow-direct-logging
import threading
import time import time
from http.server import BaseHTTPRequestHandler, HTTPServer from unittest.mock import AsyncMock, MagicMock, PropertyMock, patch
from typing import Any
from unittest.mock import AsyncMock, MagicMock, patch
import pytest import pytest
from openai.types.chat.chat_completion_chunk import ( from openai.types.chat.chat_completion_chunk import (
ChatCompletionChunk as OpenAIChatCompletionChunk, ChatCompletionChunk as OpenAIChatCompletionChunk,
) )
from openai.types.chat.chat_completion_chunk import ( from openai.types.chat.chat_completion_chunk import (
Choice as OpenAIChoice, Choice as OpenAIChoiceChunk,
) )
from openai.types.chat.chat_completion_chunk import ( from openai.types.chat.chat_completion_chunk import (
ChoiceDelta as OpenAIChoiceDelta, ChoiceDelta as OpenAIChoiceDelta,
@@ -35,6 +31,9 @@ from llama_stack.apis.inference import (
ChatCompletionRequest, ChatCompletionRequest,
ChatCompletionResponseEventType, ChatCompletionResponseEventType,
CompletionMessage, CompletionMessage,
OpenAIAssistantMessageParam,
OpenAIChatCompletion,
OpenAIChoice,
SystemMessage, SystemMessage,
ToolChoice, ToolChoice,
ToolConfig, ToolConfig,
@@ -61,41 +60,6 @@ from llama_stack.providers.remote.inference.vllm.vllm import (
# -v -s --tb=short --disable-warnings # -v -s --tb=short --disable-warnings
class MockInferenceAdapterWithSleep:
def __init__(self, sleep_time: int, response: dict[str, Any]):
self.httpd = None
class DelayedRequestHandler(BaseHTTPRequestHandler):
# ruff: noqa: N802
def do_POST(self):
time.sleep(sleep_time)
response_body = json.dumps(response).encode("utf-8")
self.send_response(code=200)
self.send_header("Content-Type", "application/json")
self.send_header("Content-Length", len(response_body))
self.end_headers()
self.wfile.write(response_body)
self.request_handler = DelayedRequestHandler
def __enter__(self):
httpd = HTTPServer(("", 0), self.request_handler)
self.httpd = httpd
host, port = httpd.server_address
httpd_thread = threading.Thread(target=httpd.serve_forever)
httpd_thread.daemon = True # stop server if this thread terminates
httpd_thread.start()
config = VLLMInferenceAdapterConfig(url=f"http://{host}:{port}")
inference_adapter = VLLMInferenceAdapter(config)
return inference_adapter
def __exit__(self, _exc_type, _exc_value, _traceback):
if self.httpd:
self.httpd.shutdown()
self.httpd.server_close()
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def mock_openai_models_list(): def mock_openai_models_list():
with patch("openai.resources.models.AsyncModels.list", new_callable=AsyncMock) as mock_list: with patch("openai.resources.models.AsyncModels.list", new_callable=AsyncMock) as mock_list:
@@ -150,10 +114,12 @@ async def test_tool_call_response(vllm_inference_adapter):
"""Verify that tool call arguments from a CompletionMessage are correctly converted """Verify that tool call arguments from a CompletionMessage are correctly converted
into the expected JSON format.""" into the expected JSON format."""
# Patch the call to vllm so we can inspect the arguments sent were correct # Patch the client property to avoid instantiating a real AsyncOpenAI client
with patch.object( with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client:
vllm_inference_adapter.client.chat.completions, "create", new_callable=AsyncMock mock_client = MagicMock()
) as mock_nonstream_completion: mock_client.chat.completions.create = AsyncMock()
mock_create_client.return_value = mock_client
messages = [ messages = [
SystemMessage(content="You are a helpful assistant"), SystemMessage(content="You are a helpful assistant"),
UserMessage(content="How many?"), UserMessage(content="How many?"),
@@ -179,7 +145,7 @@ async def test_tool_call_response(vllm_inference_adapter):
tool_config=ToolConfig(tool_choice=ToolChoice.auto), tool_config=ToolConfig(tool_choice=ToolChoice.auto),
) )
assert mock_nonstream_completion.call_args.kwargs["messages"][2]["tool_calls"] == [ assert mock_client.chat.completions.create.call_args.kwargs["messages"][2]["tool_calls"] == [
{ {
"id": "foo", "id": "foo",
"type": "function", "type": "function",
@@ -199,7 +165,7 @@ async def test_tool_call_delta_empty_tool_call_buf():
async def mock_stream(): async def mock_stream():
delta = OpenAIChoiceDelta(content="", tool_calls=None) delta = OpenAIChoiceDelta(content="", tool_calls=None)
choices = [OpenAIChoice(delta=delta, finish_reason="stop", index=0)] choices = [OpenAIChoiceChunk(delta=delta, finish_reason="stop", index=0)]
mock_chunk = OpenAIChatCompletionChunk( mock_chunk = OpenAIChatCompletionChunk(
id="chunk-1", id="chunk-1",
created=1, created=1,
@@ -225,7 +191,7 @@ async def test_tool_call_delta_streaming_arguments_dict():
model="foo", model="foo",
object="chat.completion.chunk", object="chat.completion.chunk",
choices=[ choices=[
OpenAIChoice( OpenAIChoiceChunk(
delta=OpenAIChoiceDelta( delta=OpenAIChoiceDelta(
content="", content="",
tool_calls=[ tool_calls=[
@@ -250,7 +216,7 @@ async def test_tool_call_delta_streaming_arguments_dict():
model="foo", model="foo",
object="chat.completion.chunk", object="chat.completion.chunk",
choices=[ choices=[
OpenAIChoice( OpenAIChoiceChunk(
delta=OpenAIChoiceDelta( delta=OpenAIChoiceDelta(
content="", content="",
tool_calls=[ tool_calls=[
@ -275,7 +241,9 @@ async def test_tool_call_delta_streaming_arguments_dict():
model="foo", model="foo",
object="chat.completion.chunk", object="chat.completion.chunk",
choices=[ choices=[
OpenAIChoice(delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0) OpenAIChoiceChunk(
delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0
)
], ],
) )
for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]: for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]:
@@ -299,7 +267,7 @@ async def test_multiple_tool_calls():
model="foo", model="foo",
object="chat.completion.chunk", object="chat.completion.chunk",
choices=[ choices=[
OpenAIChoice( OpenAIChoiceChunk(
delta=OpenAIChoiceDelta( delta=OpenAIChoiceDelta(
content="", content="",
tool_calls=[ tool_calls=[
@@ -324,7 +292,7 @@ async def test_multiple_tool_calls():
model="foo", model="foo",
object="chat.completion.chunk", object="chat.completion.chunk",
choices=[ choices=[
OpenAIChoice( OpenAIChoiceChunk(
delta=OpenAIChoiceDelta( delta=OpenAIChoiceDelta(
content="", content="",
tool_calls=[ tool_calls=[
@@ -349,7 +317,9 @@ async def test_multiple_tool_calls():
model="foo", model="foo",
object="chat.completion.chunk", object="chat.completion.chunk",
choices=[ choices=[
OpenAIChoice(delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0) OpenAIChoiceChunk(
delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0
)
], ],
) )
for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]: for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]:
@@ -393,59 +363,6 @@ async def test_process_vllm_chat_completion_stream_response_no_choices():
assert chunks[0].event.event_type.value == "start" assert chunks[0].event.event_type.value == "start"
@pytest.mark.allow_network
def test_chat_completion_doesnt_block_event_loop(caplog):
loop = asyncio.new_event_loop()
loop.set_debug(True)
caplog.set_level(logging.WARNING)
# Log when event loop is blocked for more than 200ms
loop.slow_callback_duration = 0.5
# Sleep for 500ms in our delayed http response
sleep_time = 0.5
mock_model = Model(identifier="mock-model", provider_resource_id="mock-model", provider_id="vllm-inference")
mock_response = {
"id": "chatcmpl-abc123",
"object": "chat.completion",
"created": 1,
"modle": "mock-model",
"choices": [
{
"message": {"content": ""},
"logprobs": None,
"finish_reason": "stop",
"index": 0,
}
],
}
async def do_chat_completion():
await inference_adapter.chat_completion(
"mock-model",
[],
stream=False,
tools=None,
tool_config=ToolConfig(tool_choice=ToolChoice.auto),
)
with MockInferenceAdapterWithSleep(sleep_time, mock_response) as inference_adapter:
inference_adapter.model_store = AsyncMock()
inference_adapter.model_store.get_model.return_value = mock_model
loop.run_until_complete(inference_adapter.initialize())
# Clear the logs so far and run the actual chat completion we care about
caplog.clear()
loop.run_until_complete(do_chat_completion())
# Ensure we don't have any asyncio warnings in the captured log
# records from our chat completion call. A message gets logged
# here any time we exceed the slow_callback_duration configured
# above.
asyncio_warnings = [record.message for record in caplog.records if record.name == "asyncio"]
assert not asyncio_warnings
async def test_get_params_empty_tools(vllm_inference_adapter): async def test_get_params_empty_tools(vllm_inference_adapter):
request = ChatCompletionRequest( request = ChatCompletionRequest(
tools=[], tools=[],
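The deleted test_chat_completion_doesnt_block_event_loop above detected event-loop blocking through asyncio's debug instrumentation: with loop.set_debug(True) and a lowered loop.slow_callback_duration, asyncio logs a warning on the "asyncio" logger whenever one task step runs longer than the threshold, and the test asserted that no such warning was captured. A minimal, standalone sketch of that detection pattern follows; detect_blocking and the two sample coroutines are illustrative names, not part of this code base.

    import asyncio
    import logging
    import time


    def detect_blocking(coro_factory, threshold: float = 0.1) -> list[str]:
        """Run a coroutine on a debug-enabled loop and collect asyncio's slow-callback warnings."""
        captured: list[str] = []

        class _Capture(logging.Handler):
            def emit(self, record: logging.LogRecord) -> None:
                captured.append(record.getMessage())

        handler = _Capture()
        logging.getLogger("asyncio").addHandler(handler)
        try:
            loop = asyncio.new_event_loop()
            loop.set_debug(True)                     # turn on slow-callback reporting
            loop.slow_callback_duration = threshold  # warn when one step blocks longer than this
            loop.run_until_complete(coro_factory())
            loop.close()
        finally:
            logging.getLogger("asyncio").removeHandler(handler)
        return captured


    async def blocks_the_loop():
        time.sleep(0.2)  # synchronous sleep: nothing else can run on the loop meanwhile


    async def yields_to_the_loop():
        await asyncio.sleep(0.2)  # cooperative sleep: control returns to the loop


    assert detect_blocking(blocks_the_loop)         # slow-callback warning captured
    assert not detect_blocking(yields_to_the_loop)  # no warnings

Its replacement further down in this file takes a timing approach instead: several mocked calls run concurrently and the total elapsed time must stay well below the serial sum of their sleeps.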
@@ -641,9 +558,7 @@ async def test_health_status_success(vllm_inference_adapter):
This test verifies that the health method returns a HealthResponse with status OK, only This test verifies that the health method returns a HealthResponse with status OK, only
when the connection to the vLLM server is successful. when the connection to the vLLM server is successful.
""" """
# Set vllm_inference_adapter.client to None to ensure _create_client is called with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client:
vllm_inference_adapter.client = None
with patch.object(vllm_inference_adapter, "_create_client") as mock_create_client:
# Create mock client and models # Create mock client and models
mock_client = MagicMock() mock_client = MagicMock()
mock_models = MagicMock() mock_models = MagicMock()
@@ -674,8 +589,7 @@ async def test_health_status_failure(vllm_inference_adapter):
This test verifies that the health method returns a HealthResponse with status ERROR This test verifies that the health method returns a HealthResponse with status ERROR
and an appropriate error message when the connection to the vLLM server fails. and an appropriate error message when the connection to the vLLM server fails.
""" """
vllm_inference_adapter.client = None with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client:
with patch.object(vllm_inference_adapter, "_create_client") as mock_create_client:
# Create mock client and models # Create mock client and models
mock_client = MagicMock() mock_client = MagicMock()
mock_models = MagicMock() mock_models = MagicMock()
@@ -697,3 +611,48 @@ async def test_health_status_failure(vllm_inference_adapter):
assert "Health check failed: Connection failed" in health_response["message"] assert "Health check failed: Connection failed" in health_response["message"]
mock_models.list.assert_called_once() mock_models.list.assert_called_once()
async def test_openai_chat_completion_is_async(vllm_inference_adapter):
"""
Verify that openai_chat_completion is async and doesn't block the event loop.
To do this we mock the underlying inference with a sleep, start multiple
inference calls in parallel, and ensure the total time taken is less
than the sum of the individual sleep times.
"""
sleep_time = 0.5
async def mock_create(*args, **kwargs):
await asyncio.sleep(sleep_time)
return OpenAIChatCompletion(
id="chatcmpl-abc123",
created=1,
model="mock-model",
choices=[
OpenAIChoice(
message=OpenAIAssistantMessageParam(
content="nothing interesting",
),
finish_reason="stop",
index=0,
)
],
)
async def do_inference():
await vllm_inference_adapter.openai_chat_completion(
"mock-model", messages=["one fish", "two fish"], stream=False
)
with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client:
mock_client = MagicMock()
mock_client.chat.completions.create = AsyncMock(side_effect=mock_create)
mock_create_client.return_value = mock_client
start_time = time.time()
await asyncio.gather(do_inference(), do_inference(), do_inference(), do_inference())
total_time = time.time() - start_time
assert mock_create_client.call_count == 4 # no cheating
assert total_time < (sleep_time * 2), f"Total time taken: {total_time}s exceeded expected max"
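The rewritten hunks above all rely on the same mocking idiom: because client is now a lazily constructed property on VLLMInferenceAdapter, it is patched on the class with PropertyMock (properties are resolved on the type, so patching an instance attribute would not work), and the fake OpenAI-style client is returned from that property. A small sketch of the idiom against a stand-in Adapter class, not the real one:

    import asyncio
    from unittest.mock import AsyncMock, MagicMock, PropertyMock, patch


    class Adapter:
        """Stand-in for an adapter whose SDK client is built lazily via a property."""

        @property
        def client(self):
            raise RuntimeError("would construct a real AsyncOpenAI client here")


    async def call_chat(adapter: Adapter) -> None:
        await adapter.client.chat.completions.create(model="m", messages=[])


    async def main() -> None:
        # Properties live on the type, so the patch targets the class attribute.
        with patch.object(Adapter, "client", new_callable=PropertyMock) as client_prop:
            fake = MagicMock()
            fake.chat.completions.create = AsyncMock()
            client_prop.return_value = fake

            await call_chat(Adapter())

            fake.chat.completions.create.assert_awaited_once()
            assert fake.chat.completions.create.call_args.kwargs["model"] == "m"


    asyncio.run(main())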

View file

@@ -0,0 +1,53 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.providers.remote.inference.bedrock.bedrock import (
_get_region_prefix,
_to_inference_profile_id,
)
def test_region_prefixes():
assert _get_region_prefix("us-east-1") == "us."
assert _get_region_prefix("eu-west-1") == "eu."
assert _get_region_prefix("ap-south-1") == "ap."
assert _get_region_prefix("ca-central-1") == "us."
# Test case insensitive
assert _get_region_prefix("US-EAST-1") == "us."
assert _get_region_prefix("EU-WEST-1") == "eu."
assert _get_region_prefix("Ap-South-1") == "ap."
# Test None region
assert _get_region_prefix(None) == "us."
def test_model_id_conversion():
# Basic conversion
assert (
_to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0", "us-east-1") == "us.meta.llama3-1-70b-instruct-v1:0"
)
# Already has prefix
assert (
_to_inference_profile_id("us.meta.llama3-1-70b-instruct-v1:0", "us-east-1")
== "us.meta.llama3-1-70b-instruct-v1:0"
)
# ARN should be returned unchanged
arn = "arn:aws:bedrock:us-east-1:123456789012:inference-profile/us.meta.llama3-1-70b-instruct-v1:0"
assert _to_inference_profile_id(arn, "us-east-1") == arn
# ARN should be returned unchanged even without region
assert _to_inference_profile_id(arn) == arn
# Optional region parameter defaults to us-east-1
assert _to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0") == "us.meta.llama3-1-70b-instruct-v1:0"
# Different regions work with optional parameter
assert (
_to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0", "eu-west-1") == "eu.meta.llama3-1-70b-instruct-v1:0"
)
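These new tests pin down the Bedrock helpers purely by their observable behavior: the us., eu. and ap. prefixes are recognized case-insensitively, anything else (including a missing region) falls back to us., and model IDs that are already ARNs or already carry a prefix pass through unchanged. The sketch below is a hypothetical reconstruction written only to satisfy these assertions; the real module may be implemented differently.

    # Hypothetical reconstruction of the helpers exercised above.
    _KNOWN_PREFIXES = ("us", "eu", "ap")


    def _get_region_prefix(region: str | None) -> str:
        if region:
            prefix = region.lower().split("-")[0]
            if prefix in _KNOWN_PREFIXES:
                return f"{prefix}."
        return "us."  # unknown or missing regions fall back to the US prefix


    def _to_inference_profile_id(model_id: str, region: str = "us-east-1") -> str:
        if model_id.startswith("arn:"):
            return model_id  # full ARNs are already unambiguous
        if model_id.startswith(tuple(f"{p}." for p in _KNOWN_PREFIXES)):
            return model_id  # already a cross-region inference profile id
        return _get_region_prefix(region) + model_id


    assert _get_region_prefix("ca-central-1") == "us."
    assert _to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0", "eu-west-1") == "eu.meta.llama3-1-70b-instruct-v1:0"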

View file

@@ -178,3 +178,41 @@ def test_content_from_data_and_mime_type_both_encodings_fail():
# Should raise an exception instead of returning empty string # Should raise an exception instead of returning empty string
with pytest.raises(UnicodeDecodeError): with pytest.raises(UnicodeDecodeError):
content_from_data_and_mime_type(data, mime_type) content_from_data_and_mime_type(data, mime_type)
async def test_memory_tool_error_handling():
"""Test that memory tool handles various failures gracefully without crashing."""
from llama_stack.providers.inline.tool_runtime.rag.config import RagToolRuntimeConfig
from llama_stack.providers.inline.tool_runtime.rag.memory import MemoryToolRuntimeImpl
config = RagToolRuntimeConfig()
memory_tool = MemoryToolRuntimeImpl(
config=config,
vector_io_api=AsyncMock(),
inference_api=AsyncMock(),
files_api=AsyncMock(),
)
docs = [
RAGDocument(document_id="good_doc", content="Good content", metadata={}),
RAGDocument(document_id="bad_url_doc", content=URL(uri="https://bad.url"), metadata={}),
RAGDocument(document_id="another_good_doc", content="Another good content", metadata={}),
]
mock_file1 = MagicMock()
mock_file1.id = "file_good1"
mock_file2 = MagicMock()
mock_file2.id = "file_good2"
memory_tool.files_api.openai_upload_file.side_effect = [mock_file1, mock_file2]
with patch("httpx.AsyncClient") as mock_client:
mock_instance = AsyncMock()
mock_instance.get.side_effect = Exception("Bad URL")
mock_client.return_value.__aenter__.return_value = mock_instance
# won't raise exception despite one document failing
await memory_tool.insert(docs, "vector_store_123")
# processed 2 documents successfully, skipped 1
assert memory_tool.files_api.openai_upload_file.call_count == 2
assert memory_tool.vector_io_api.openai_attach_file_to_vector_store.call_count == 2
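The memory-tool test above checks only the observable outcome: two of the three documents are uploaded and attached, and the bad-URL document is skipped without raising. The pattern it depends on is per-document error isolation during ingestion; a rough sketch with illustrative names (insert_documents, fetch, upload, attach), not the actual MemoryToolRuntimeImpl code:

    import asyncio
    import logging
    from unittest.mock import AsyncMock

    logger = logging.getLogger(__name__)


    async def insert_documents(docs, fetch, upload, attach, vector_store_id: str) -> int:
        """Ingest documents one at a time; a failure on any single document is logged and skipped."""
        inserted = 0
        for doc in docs:
            try:
                content = await fetch(doc)           # may raise, e.g. for an unreachable URL
                file = await upload(content)         # stand-in for files_api.openai_upload_file(...)
                await attach(vector_store_id, file)  # stand-in for openai_attach_file_to_vector_store(...)
                inserted += 1
            except Exception:
                logger.exception("Failed to insert document %s; skipping", getattr(doc, "document_id", doc))
        return inserted


    async def demo() -> None:
        fetch = AsyncMock(side_effect=["good one", Exception("bad URL"), "good two"])
        upload = AsyncMock(side_effect=lambda content: {"id": content})
        attach = AsyncMock()
        assert await insert_documents(["a", "b", "c"], fetch, upload, attach, "vs_123") == 2


    asyncio.run(demo())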

View file

@@ -26,9 +26,9 @@ def test_generate_chunk_id():
chunk_ids = sorted([chunk.chunk_id for chunk in chunks]) chunk_ids = sorted([chunk.chunk_id for chunk in chunks])
assert chunk_ids == [ assert chunk_ids == [
"177a1368-f6a8-0c50-6e92-18677f2c3de3", "31d1f9a3-c8d2-66e7-3c37-af2acd329778",
"bc744db3-1b25-0a9c-cdff-b6ba3df73c36", "d07dade7-29c0-cda7-df29-0249a1dcbc3e",
"f68df25d-d9aa-ab4d-5684-64a233add20d", "d14f75a1-5855-7f72-2c78-d9fc4275a346",
] ]
@@ -36,14 +36,14 @@ def test_generate_chunk_id_with_window():
chunk = Chunk(content="test", metadata={"document_id": "doc-1"}) chunk = Chunk(content="test", metadata={"document_id": "doc-1"})
chunk_id1 = generate_chunk_id("doc-1", chunk, chunk_window="0-1") chunk_id1 = generate_chunk_id("doc-1", chunk, chunk_window="0-1")
chunk_id2 = generate_chunk_id("doc-1", chunk, chunk_window="1-2") chunk_id2 = generate_chunk_id("doc-1", chunk, chunk_window="1-2")
assert chunk_id1 == "149018fe-d0eb-0f8d-5f7f-726bdd2aeedb" assert chunk_id1 == "8630321a-d9cb-2bb6-cd28-ebf68dafd866"
assert chunk_id2 == "4562c1ee-9971-1f3b-51a6-7d05e5211154" assert chunk_id2 == "13a1c09a-cbda-b61a-2d1a-7baa90888685"
def test_chunk_id(): def test_chunk_id():
# Test with existing chunk ID # Test with existing chunk ID
chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"}) chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"})
assert chunk_with_id.chunk_id == "84ededcc-b80b-a83e-1a20-ca6515a11350" assert chunk_with_id.chunk_id == "11704f92-42b6-61df-bf85-6473e7708fbd"
# Test with document ID in metadata # Test with document ID in metadata
chunk_with_doc_id = Chunk(content="test", metadata={"document_id": "doc-1"}) chunk_with_doc_id = Chunk(content="test", metadata={"document_id": "doc-1"})
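The expected chunk IDs change here because each ID is a deterministic digest of the chunk's identity (document ID, content, and optional chunk window), so any change to the hashing inputs or scheme shifts every golden value at once. The sketch below only illustrates the deterministic-ID idea with uuid5; it is not the project's generate_chunk_id and will not reproduce the values asserted above.

    import uuid


    def deterministic_chunk_id(document_id: str, content: str, chunk_window: str | None = None) -> str:
        """Illustrative only: derive a stable, UUID-shaped id from the chunk's identity fields."""
        key = f"{document_id}:{content}"
        if chunk_window:
            key += f":{chunk_window}"
        # uuid5 is deterministic for the same namespace and name, so equal inputs give equal ids
        return str(uuid.uuid5(uuid.NAMESPACE_DNS, key))


    assert deterministic_chunk_id("doc-1", "test") == deterministic_chunk_id("doc-1", "test")
    assert deterministic_chunk_id("doc-1", "test", "0-1") != deterministic_chunk_id("doc-1", "test", "1-2")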

View file

@@ -65,6 +65,9 @@ async def test_inference_store_pagination_basic():
input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")]
await store.store_chat_completion(completion, input_messages) await store.store_chat_completion(completion, input_messages)
# Wait for all queued writes to complete
await store.flush()
# Test 1: First page with limit=2, descending order (default) # Test 1: First page with limit=2, descending order (default)
result = await store.list_chat_completions(limit=2, order=Order.desc) result = await store.list_chat_completions(limit=2, order=Order.desc)
assert len(result.data) == 2 assert len(result.data) == 2
@@ -108,6 +111,9 @@ async def test_inference_store_pagination_ascending():
input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")]
await store.store_chat_completion(completion, input_messages) await store.store_chat_completion(completion, input_messages)
# Wait for all queued writes to complete
await store.flush()
# Test ascending order pagination # Test ascending order pagination
result = await store.list_chat_completions(limit=1, order=Order.asc) result = await store.list_chat_completions(limit=1, order=Order.asc)
assert len(result.data) == 1 assert len(result.data) == 1
@@ -143,6 +149,9 @@ async def test_inference_store_pagination_with_model_filter():
input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")]
await store.store_chat_completion(completion, input_messages) await store.store_chat_completion(completion, input_messages)
# Wait for all queued writes to complete
await store.flush()
# Test pagination with model filter # Test pagination with model filter
result = await store.list_chat_completions(limit=1, model="model-a", order=Order.desc) result = await store.list_chat_completions(limit=1, model="model-a", order=Order.desc)
assert len(result.data) == 1 assert len(result.data) == 1
@@ -190,6 +199,9 @@ async def test_inference_store_pagination_no_limit():
input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")]
await store.store_chat_completion(completion, input_messages) await store.store_chat_completion(completion, input_messages)
# Wait for all queued writes to complete
await store.flush()
# Test without limit # Test without limit
result = await store.list_chat_completions(order=Order.desc) result = await store.list_chat_completions(order=Order.desc)
assert len(result.data) == 2 assert len(result.data) == 2
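Each of these hunks adds the same await store.flush() before reading results back, which indicates that store_chat_completion now enqueues writes for a background worker instead of committing them inline, so the tests must drain the backlog before asserting. A generic queue-plus-flush shape is sketched below; QueuedWriter is illustrative, not the InferenceStore implementation.

    import asyncio


    class QueuedWriter:
        """Writes are enqueued and applied by a background task; flush() waits out the backlog."""

        def __init__(self) -> None:
            self._queue: asyncio.Queue = asyncio.Queue()
            self._rows: list[dict] = []
            self._worker = asyncio.create_task(self._drain())

        async def _drain(self) -> None:
            while True:
                row = await self._queue.get()
                self._rows.append(row)      # stand-in for the real database write
                self._queue.task_done()

        async def store(self, row: dict) -> None:
            await self._queue.put(row)      # returns before the write has actually landed

        async def flush(self) -> None:
            await self._queue.join()        # blocks until every queued write has been processed


    async def main() -> None:
        store = QueuedWriter()
        await store.store({"id": "chatcmpl-1"})
        await store.flush()                 # without this, a read could race the background write
        assert store._rows == [{"id": "chatcmpl-1"}]


    asyncio.run(main())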

uv.lock generated
View file

@@ -1,5 +1,5 @@
version = 1 version = 1
revision = 3 revision = 2
requires-python = ">=3.12" requires-python = ">=3.12"
resolution-markers = [ resolution-markers = [
"(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')", "(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')",
@@ -1839,7 +1839,6 @@ test = [
{ name = "datasets" }, { name = "datasets" },
{ name = "mcp" }, { name = "mcp" },
{ name = "milvus-lite" }, { name = "milvus-lite" },
{ name = "openai" },
{ name = "psycopg2-binary" }, { name = "psycopg2-binary" },
{ name = "pymilvus" }, { name = "pymilvus" },
{ name = "pypdf" }, { name = "pypdf" },
@@ -1865,7 +1864,6 @@ unit = [
{ name = "milvus-lite" }, { name = "milvus-lite" },
{ name = "moto", extra = ["s3"] }, { name = "moto", extra = ["s3"] },
{ name = "ollama" }, { name = "ollama" },
{ name = "openai" },
{ name = "psycopg2-binary" }, { name = "psycopg2-binary" },
{ name = "pymilvus" }, { name = "pymilvus" },
{ name = "pypdf" }, { name = "pypdf" },
@@ -1889,7 +1887,7 @@ requires-dist = [
{ name = "jsonschema" }, { name = "jsonschema" },
{ name = "llama-stack-client", specifier = ">=0.2.21" }, { name = "llama-stack-client", specifier = ">=0.2.21" },
{ name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.21" }, { name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.21" },
{ name = "openai", specifier = ">=1.99.6" }, { name = "openai", specifier = ">=1.100.0" },
{ name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.30.0" }, { name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.30.0" },
{ name = "opentelemetry-sdk", specifier = ">=1.30.0" }, { name = "opentelemetry-sdk", specifier = ">=1.30.0" },
{ name = "pandas", marker = "extra == 'ui'" }, { name = "pandas", marker = "extra == 'ui'" },
@@ -1959,7 +1957,6 @@ test = [
{ name = "datasets", specifier = ">=4.0.0" }, { name = "datasets", specifier = ">=4.0.0" },
{ name = "mcp" }, { name = "mcp" },
{ name = "milvus-lite", specifier = ">=2.5.0" }, { name = "milvus-lite", specifier = ">=2.5.0" },
{ name = "openai", specifier = ">=1.100.0" },
{ name = "psycopg2-binary", specifier = ">=2.9.0" }, { name = "psycopg2-binary", specifier = ">=2.9.0" },
{ name = "pymilvus", specifier = ">=2.6.1" }, { name = "pymilvus", specifier = ">=2.6.1" },
{ name = "pypdf" }, { name = "pypdf" },
@@ -1984,7 +1981,6 @@ unit = [
{ name = "milvus-lite", specifier = ">=2.5.0" }, { name = "milvus-lite", specifier = ">=2.5.0" },
{ name = "moto", extras = ["s3"], specifier = ">=5.1.10" }, { name = "moto", extras = ["s3"], specifier = ">=5.1.10" },
{ name = "ollama" }, { name = "ollama" },
{ name = "openai" },
{ name = "psycopg2-binary", specifier = ">=2.9.0" }, { name = "psycopg2-binary", specifier = ">=2.9.0" },
{ name = "pymilvus", specifier = ">=2.6.1" }, { name = "pymilvus", specifier = ">=2.6.1" },
{ name = "pypdf" }, { name = "pypdf" },
@@ -2023,7 +2019,7 @@ wheels = [
[[package]] [[package]]
name = "locust" name = "locust"
version = "2.39.1" version = "2.40.1"
source = { registry = "https://pypi.org/simple" } source = { registry = "https://pypi.org/simple" }
dependencies = [ dependencies = [
{ name = "configargparse" }, { name = "configargparse" },
@@ -2035,6 +2031,7 @@ dependencies = [
{ name = "locust-cloud" }, { name = "locust-cloud" },
{ name = "msgpack" }, { name = "msgpack" },
{ name = "psutil" }, { name = "psutil" },
{ name = "pytest" },
{ name = "python-engineio" }, { name = "python-engineio" },
{ name = "python-socketio", extra = ["client"] }, { name = "python-socketio", extra = ["client"] },
{ name = "pywin32", marker = "sys_platform == 'win32'" }, { name = "pywin32", marker = "sys_platform == 'win32'" },
@@ -2043,9 +2040,9 @@ dependencies = [
{ name = "setuptools" }, { name = "setuptools" },
{ name = "werkzeug" }, { name = "werkzeug" },
] ]
sdist = { url = "https://files.pythonhosted.org/packages/95/c8/10aa5445c404eed389b56877e6714c1787190cc09dd70059ce3765979ec5/locust-2.39.1.tar.gz", hash = "sha256:6bdd19e27edf9a1c84391d6cf6e9a737dfb832be7dfbf39053191ae31b9cc498", size = 1409902, upload-time = "2025-08-29T17:41:01.544Z" } sdist = { url = "https://files.pythonhosted.org/packages/01/22/82f40176473a98c9479bed667d3ad21bb859d2cb67f6880a6b0b6a725e45/locust-2.40.1.tar.gz", hash = "sha256:5bde76c1cf7e412071670f926f34844e119210c93f07a4cf9fc4cb93c60a578a", size = 1411606, upload-time = "2025-09-05T15:57:35.76Z" }
wheels = [ wheels = [
{ url = "https://files.pythonhosted.org/packages/ec/b3/b2f4b2ca88b1e72eba7be2b2982533b887f8b709d222db78eb9602aa5121/locust-2.39.1-py3-none-any.whl", hash = "sha256:fd5148f2f1a4ed34aee968abc4393674e69d1b5e1b54db50a397f6eb09ce0b04", size = 1428155, upload-time = "2025-08-29T17:41:00.245Z" }, { url = "https://files.pythonhosted.org/packages/3b/e6/9c6335ab16becf4f8ad3da6083ab78793c56ec1ca496d6f7c74660c21c3f/locust-2.40.1-py3-none-any.whl", hash = "sha256:ef0517f9bb5ed0afa7035014faaf944802917e07da8649461aaaf5e5f3ba8a65", size = 1430154, upload-time = "2025-09-05T15:57:33.233Z" },
] ]
[[package]] [[package]]
@@ -2619,7 +2616,7 @@ wheels = [
[[package]] [[package]]
name = "openai" name = "openai"
version = "1.102.0" version = "1.107.0"
source = { registry = "https://pypi.org/simple" } source = { registry = "https://pypi.org/simple" }
dependencies = [ dependencies = [
{ name = "anyio" }, { name = "anyio" },
@@ -2631,9 +2628,9 @@ dependencies = [
{ name = "tqdm" }, { name = "tqdm" },
{ name = "typing-extensions" }, { name = "typing-extensions" },
] ]
sdist = { url = "https://files.pythonhosted.org/packages/07/55/da5598ed5c6bdd9939633854049cddc5cbac0da938dfcfcb3c6b119c16c0/openai-1.102.0.tar.gz", hash = "sha256:2e0153bcd64a6523071e90211cbfca1f2bbc5ceedd0993ba932a5869f93b7fc9", size = 519027, upload-time = "2025-08-26T20:50:29.397Z" } sdist = { url = "https://files.pythonhosted.org/packages/88/67/d6498de300f83ff57a79cb7aa96ef3bef8d6f070c3ded0f1b5b45442a6bc/openai-1.107.0.tar.gz", hash = "sha256:43e04927584e57d0e9e640ee0077c78baf8150098be96ebd5c512539b6c4e9a4", size = 566056, upload-time = "2025-09-08T19:25:47.604Z" }
wheels = [ wheels = [
{ url = "https://files.pythonhosted.org/packages/bd/0d/c9e7016d82c53c5b5e23e2bad36daebb8921ed44f69c0a985c6529a35106/openai-1.102.0-py3-none-any.whl", hash = "sha256:d751a7e95e222b5325306362ad02a7aa96e1fab3ed05b5888ce1c7ca63451345", size = 812015, upload-time = "2025-08-26T20:50:27.219Z" }, { url = "https://files.pythonhosted.org/packages/91/ed/e8a4fd20390f2858b95227c288df8fe0c835f7c77625f7583609161684ba/openai-1.107.0-py3-none-any.whl", hash = "sha256:3dcfa3cbb116bd6924b27913b8da28c4a787379ff60049588547a1013e6d6438", size = 950968, upload-time = "2025-09-08T19:25:45.552Z" },
] ]
[[package]] [[package]]
@ -3540,7 +3537,7 @@ wheels = [
[[package]] [[package]]
name = "pytest" name = "pytest"
version = "8.4.1" version = "8.4.2"
source = { registry = "https://pypi.org/simple" } source = { registry = "https://pypi.org/simple" }
dependencies = [ dependencies = [
{ name = "colorama", marker = "sys_platform == 'win32'" }, { name = "colorama", marker = "sys_platform == 'win32'" },
@@ -3549,9 +3546,9 @@ dependencies = [
{ name = "pluggy" }, { name = "pluggy" },
{ name = "pygments" }, { name = "pygments" },
] ]
sdist = { url = "https://files.pythonhosted.org/packages/08/ba/45911d754e8eba3d5a841a5ce61a65a685ff1798421ac054f85aa8747dfb/pytest-8.4.1.tar.gz", hash = "sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c", size = 1517714, upload-time = "2025-06-18T05:48:06.109Z" } sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" }
wheels = [ wheels = [
{ url = "https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7", size = 365474, upload-time = "2025-06-18T05:48:03.955Z" }, { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" },
] ]
[[package]] [[package]]