Merge branch 'main' into feat/litellm_sambanova_usage

2025-12-31 00:03:52 +00:00 · 2025-04-01 07:57:21 -05:00 · 2025-04-01 07:57:21 -05:00 · 9c9f9577e2
commit 9c9f9577e2
parent 8783dd8162 19f504e9e2
173 changed files with 3073 additions and 3118 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -2,4 +2,4 @@

 # These owners will be the default owners for everything in
 # the repo. Unless a later match takes precedence,
-* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan @SLR722
+* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan @SLR722 @leseb
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@ -25,7 +25,8 @@ jobs:
      matrix:
        # Listing tests manually since some of them currently fail
        # TODO: generate matrix list from tests/integration when fixed
-        test-type: [inference, datasets, inspect, scoring, post_training, providers]
+        test-type: [agents, inference, datasets, inspect, scoring, post_training, providers]
+        client-type: [library, http]
      fail-fast: false # we want to run all tests regardless of failure

    steps:
@ -54,6 +55,8 @@ jobs:
          uv sync --extra dev --extra test
          uv pip install ollama faiss-cpu
          # always test against the latest version of the client
+          # TODO: this is not necessarily a good idea. We need to test against both the published
+          # and the latest client to catch backwards-compatibility issues.
          uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
          uv pip install -e .
          llama stack build --template ollama --image-type venv
@ -74,6 +77,7 @@ jobs:
          exit 1

      - name: Start Llama Stack server in background
+        if: matrix.client-type == 'http'
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
        run: |
@ -81,6 +85,7 @@ jobs:
          nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv > server.log 2>&1 &

      - name: Wait for Llama Stack server to be ready
+        if: matrix.client-type == 'http'
        run: |
          echo "Waiting for Llama Stack server..."
          for i in {1..30}; do
@ -98,4 +103,12 @@ jobs:
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
        run: |
-          uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=ollama --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2
+          if [ "${{ matrix.client-type }}" == "library" ]; then
+            stack_config="ollama"
+          else
+            stack_config="http://localhost:8321"
+          fi
+          uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
+            -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
+            --text-model="meta-llama/Llama-3.2-3B-Instruct" \
+            --embedding-model=all-MiniLM-L6-v2
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,76 @@
 # Changelog

+# v0.1.8
+Published on: 2025-03-24T01:28:50Z
+
+# v0.1.8 Release Notes
+
+### Build and Test Agents
+* Safety: Integrated NVIDIA as a safety provider.
+* VectorDB: Added Qdrant as an inline provider.
+* Agents: Added support for multiple tool groups in agents.
+* Agents: Simplified imports for Agents in client package
+
+
+### Agent Evals and Model Customization
+* Introduced DocVQA and IfEval benchmarks.
+
+### Deploying and Monitoring Agents
+* Introduced a Containerfile and image workflow for the Playground.
+* Implemented support for Bearer (API Key) authentication.
+* Added attribute-based access control for resources.
+* Fixed Docker deployments: now use `--pull always`, and standardized the default port to 8321.
+* Deprecated: `/v1/inspect/providers`; use `/v1/providers/` instead.
+
+### Better Engineering
+* Consolidated scripts under the ./scripts directory.
+* Addressed mypy violations in various modules.
+* Added Dependabot scans for Python dependencies.
+* Implemented a scheduled workflow to update the changelog automatically.
+* Enforced concurrency to reduce CI loads.
+
+
+### New Contributors
+* @cmodi-meta made their first contribution in https://github.com/meta-llama/llama-stack/pull/1650
+* @jeffmaury made their first contribution in https://github.com/meta-llama/llama-stack/pull/1671
+* @derekhiggins made their first contribution in https://github.com/meta-llama/llama-stack/pull/1698
+* @Bobbins228 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1745
+
+**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.7...v0.1.8
+
+---
+
+# v0.1.7
+Published on: 2025-03-14T22:30:51Z
+
+## 0.1.7 Release Notes
+
+###  Build and Test Agents
+* Inference: ImageType is now refactored to LlamaStackImageType
+* Inference: Added tests to measure TTFT
+* Inference: Bring back usage metrics
+* Agents: Added endpoint for get agent, list agents and list sessions
+* Agents: Automated conversion of type hints in client tool for LiteLLM format
+* Agents: Deprecated ToolResponseMessage in agent.resume API
+* Added Provider API for listing and inspecting provider info
+
+### Agent Evals and Model Customization
+* Eval: Added new eval benchmarks Math 500 and BFCL v3
+### Deploying and Monitoring Agents
+* Telemetry: Fix tracing to work across coroutines
+
+###  Better Engineering
+* Display code coverage for unit tests
+* Updated call sites (inference, tool calls, agents) to move to async non blocking calls
+* Unit tests also run on Python 3.11, 3.12, and 3.13
+* Added ollama inference to Integration tests CI
+* Improved documentation across examples, testing, and CLI; updated the providers table
+
+
+
+
+---
+
 # v0.1.6
 Published on: 2025-03-08T04:35:08Z

--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -81,12 +81,14 @@ Note that you can create a dotenv file `.env` that includes necessary environmen
 LLAMA_STACK_BASE_URL=http://localhost:8321
 LLAMA_STACK_CLIENT_LOG=debug
 LLAMA_STACK_PORT=8321
-LLAMA_STACK_CONFIG=
+LLAMA_STACK_CONFIG=<provider-name>
+TAVILY_SEARCH_API_KEY=
+BRAVE_SEARCH_API_KEY=
 ```

 And then use this dotenv file when running client SDK tests via the following:
 ```bash
-uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py
+uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
 ```

 ## Pre-commit Hooks
@ -124,6 +126,10 @@ source .venv/bin/activate
 PYTHON_VERSION=3.13 ./scripts/unit-tests.sh
 ```

+## Running integration tests
+
+To run the integration tests, follow the instructions in the [integration tests README](tests/integration/README.md).
+
 ## Adding a new dependency to the project

 To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,5 +1,5 @@
 include pyproject.toml
-include distributions/dependencies.json
+include llama_stack/templates/dependencies.json
 include llama_stack/models/llama/llama3/tokenizer.model
 include llama_stack/distribution/*.sh
 include llama_stack/cli/scripts/*.sh
--- a/distributions/bedrock/build.yaml
+++ b/distributions/bedrock/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/bedrock/build.yaml
--- a/distributions/bedrock/compose.yaml
+++ b/distributions/bedrock/compose.yaml
@ -1,15 +0,0 @@
-services:
-  llamastack:
-    image: distribution-bedrock
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-bedrock.yaml
-    ports:
-      - "8321:8321"
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/bedrock/run.yaml
+++ b/distributions/bedrock/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/bedrock/run.yaml
--- a/distributions/cerebras/build.yaml
+++ b/distributions/cerebras/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/cerebras/build.yaml
--- a/distributions/cerebras/compose.yaml
+++ b/distributions/cerebras/compose.yaml
@ -1,16 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-cerebras
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-cerebras.yaml
-    ports:
-      - "8321:8321"
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/cerebras/run.yaml
+++ b/distributions/cerebras/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/cerebras/run.yaml
--- a/distributions/dell-tgi/compose.yaml
+++ b/distributions/dell-tgi/compose.yaml
@ -1,50 +0,0 @@
-services:
-  text-generation-inference:
-    image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
-    network_mode: "host"
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    ports:
-      - "5009:5009"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0,1,2,3,4
-      - NUM_SHARD=4
-      - MAX_BATCH_PREFILL_TOKENS=32768
-      - MAX_INPUT_TOKENS=8000
-      - MAX_TOTAL_TOKENS=8192
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            # that's the closest analogue to --gpus; provide
-            # an integer amount of devices or 'all'
-            count: all
-            # Devices are reserved using a list of capabilities, making
-            # capabilities the only required field. A device MUST
-            # satisfy all the requested capabilities for a successful
-            # reservation.
-            capabilities: [gpu]
-    runtime: nvidia
-  llamastack:
-    depends_on:
-      text-generation-inference:
-        condition: service_healthy
-    image: llamastack/distribution-tgi
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      # Link to TGI run.yaml file
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    # Hack: wait for TGI server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    restart_policy:
-      condition: on-failure
-      delay: 3s
-      max_attempts: 5
-      window: 60s
--- a/distributions/dell-tgi/run.yaml
+++ b/distributions/dell-tgi/run.yaml
@ -1,44 +0,0 @@
-version: '2'
-image_name: local
-container_image: null
-conda_env: local
-apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
-providers:
-  inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url: http://127.0.0.1:80
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::faiss
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
--- a/distributions/fireworks/build.yaml
+++ b/distributions/fireworks/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/fireworks/build.yaml
--- a/distributions/fireworks/compose.yaml
+++ b/distributions/fireworks/compose.yaml
@ -1,14 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-fireworks
-    ports:
-      - "8321:8321"
-    environment:
-      - FIREWORKS_API_KEY=${FIREWORKS_API_KEY}
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template fireworks"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/fireworks/run.yaml
+++ b/distributions/fireworks/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/fireworks/run.yaml
--- a/distributions/meta-reference-gpu/build.yaml
+++ b/distributions/meta-reference-gpu/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-gpu/build.yaml
--- a/distributions/meta-reference-gpu/compose.yaml
+++ b/distributions/meta-reference-gpu/compose.yaml
@ -1,34 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-meta-reference-gpu
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            # that's the closest analogue to --gpus; provide
-            # an integer amount of devices or 'all'
-            count: 1
-            # Devices are reserved using a list of capabilities, making
-            # capabilities the only required field. A device MUST
-            # satisfy all the requested capabilities for a successful
-            # reservation.
-            capabilities: [gpu]
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
-    runtime: nvidia
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
--- a/distributions/meta-reference-gpu/run-with-safety.yaml
+++ b/distributions/meta-reference-gpu/run-with-safety.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
--- a/distributions/meta-reference-gpu/run.yaml
+++ b/distributions/meta-reference-gpu/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-gpu/run.yaml
--- a/distributions/meta-reference-quantized-gpu/build.yaml
+++ b/distributions/meta-reference-quantized-gpu/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml
--- a/distributions/meta-reference-quantized-gpu/compose.yaml
+++ b/distributions/meta-reference-quantized-gpu/compose.yaml
@ -1,35 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-meta-reference-quantized-gpu
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            # that's the closest analogue to --gpus; provide
-            # an integer amount of devices or 'all'
-            count: 1
-            # Devices are reserved using a list of capabilities, making
-            # capabilities the only required field. A device MUST
-            # satisfy all the requested capabilities for a successful
-            # reservation.
-            capabilities: [gpu]
-    runtime: nvidia
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/meta-reference-quantized-gpu/run.yaml
+++ b/distributions/meta-reference-quantized-gpu/run.yaml
@ -1,58 +0,0 @@
-version: '2'
-image_name: local
-container_image: null
-conda_env: local
-apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
-providers:
-  inference:
-  - provider_id: meta0
-    provider_type: inline::meta-reference-quantized
-    config:
-      model: Llama3.2-3B-Instruct:int4-qlora-eo8
-      quantization:
-        type: int4
-      torch_seed: null
-      max_seq_len: 2048
-      max_batch_size: 1
-  - provider_id: meta1
-    provider_type: inline::meta-reference-quantized
-    config:
-      # not a quantized model !
-      model: Llama-Guard-3-1B
-      quantization: null
-      torch_seed: null
-      max_seq_len: 2048
-      max_batch_size: 1
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
--- a/distributions/ollama/build.yaml
+++ b/distributions/ollama/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/ollama/build.yaml
--- a/distributions/ollama/compose.yaml
+++ b/distributions/ollama/compose.yaml
@ -1,71 +0,0 @@
-services:
-  ollama:
-    image: ollama/ollama:latest
-    network_mode: ${NETWORK_MODE:-bridge}
-    volumes:
-      - ~/.ollama:/root/.ollama
-    ports:
-      - "11434:11434"
-    environment:
-      OLLAMA_DEBUG: 1
-    command: []
-    deploy:
-      resources:
-        limits:
-          memory: 8G    # Set maximum memory
-        reservations:
-          memory: 8G    # Set minimum memory reservation
-    # healthcheck:
-    #   # ugh, no CURL in ollama image
-    #   test: ["CMD", "curl", "-f", "http://ollama:11434"]
-    #   interval: 10s
-    #   timeout: 5s
-    #   retries: 5
-
-  ollama-init:
-    image: ollama/ollama:latest
-    depends_on:
-      - ollama
-        # condition: service_healthy
-    network_mode: ${NETWORK_MODE:-bridge}
-    environment:
-      - OLLAMA_HOST=ollama
-      - INFERENCE_MODEL=${INFERENCE_MODEL}
-      - SAFETY_MODEL=${SAFETY_MODEL:-}
-    volumes:
-      - ~/.ollama:/root/.ollama
-      - ./pull-models.sh:/pull-models.sh
-    entrypoint: ["/pull-models.sh"]
-
-  llamastack:
-    depends_on:
-      ollama:
-        condition: service_started
-      ollama-init:
-        condition: service_started
-    image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
-    network_mode: ${NETWORK_MODE:-bridge}
-    volumes:
-      - ~/.llama:/root/.llama
-      # Link to ollama run.yaml file
-      - ~/local/llama-stack/:/app/llama-stack-source
-      - ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
-    ports:
-      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
-    environment:
-      - INFERENCE_MODEL=${INFERENCE_MODEL}
-      - SAFETY_MODEL=${SAFETY_MODEL:-}
-      - OLLAMA_URL=http://ollama:11434
-    entrypoint: >
-        python -m llama_stack.distribution.server.server /root/my-run.yaml \
-        --port ${LLAMA_STACK_PORT:-8321}
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 10s
-        max_attempts: 3
-        window: 60s
-volumes:
-  ollama:
-  ollama-init:
-  llamastack:
--- a/distributions/ollama/pull-models.sh
+++ b/distributions/ollama/pull-models.sh
@ -1,18 +0,0 @@
-#!/bin/sh
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
-for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do
-  echo "Preloading $model..."
-  if ! ollama run "$model"; then
-    echo "Failed to pull and run $model"
-    exit 1
-  fi
-done
-
-echo "All models pulled successfully"
--- a/distributions/ollama/run-with-safety.yaml
+++ b/distributions/ollama/run-with-safety.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/ollama/run-with-safety.yaml
--- a/distributions/ollama/run.yaml
+++ b/distributions/ollama/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/ollama/run.yaml
--- a/distributions/ramalama/faiss_store.db
+++ b/distributions/ramalama/faiss_store.db
--- a/distributions/remote-nvidia/build.yaml
+++ b/distributions/remote-nvidia/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/nvidia/build.yaml
--- a/distributions/remote-nvidia/compose.yaml
+++ b/distributions/remote-nvidia/compose.yaml
@ -1,19 +0,0 @@
-services:
-  llamastack:
-    image: distribution-nvidia:dev
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-nvidia.yaml
-    ports:
-      - "8321:8321"
-    environment:
-      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
-      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/remote-nvidia/run.yaml
+++ b/distributions/remote-nvidia/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/nvidia/run.yaml
--- a/distributions/remote-vllm/build.yaml
+++ b/distributions/remote-vllm/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/remote-vllm/build.yaml
--- a/distributions/remote-vllm/compose.yaml
+++ b/distributions/remote-vllm/compose.yaml
@ -1,99 +0,0 @@
-services:
-  vllm-inference:
-    image: vllm/vllm-openai:latest
-    volumes:
-      - $HOME/.cache/huggingface:/root/.cache/huggingface
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-       - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
-      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
-    command: >
-      --gpu-memory-utilization 0.75
-      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      --enforce-eager
-      --max-model-len 8192
-      --max-num-seqs 16
-      --port ${VLLM_INFERENCE_PORT:-5100}
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
-      interval: 30s
-      timeout: 10s
-      retries: 5
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            capabilities: [gpu]
-    runtime: nvidia
-
-  # A little trick:
-  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
-  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
-  vllm-${VLLM_SAFETY_MODEL:+safety}:
-    image: vllm/vllm-openai:latest
-    volumes:
-      - $HOME/.cache/huggingface:/root/.cache/huggingface
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
-      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
-    command: >
-      --gpu-memory-utilization 0.75
-      --model ${VLLM_SAFETY_MODEL}
-      --enforce-eager
-      --max-model-len 8192
-      --max-num-seqs 16
-      --port ${VLLM_SAFETY_PORT:-5101}
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
-      interval: 30s
-      timeout: 10s
-      retries: 5
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            capabilities: [gpu]
-    runtime: nvidia
-  llamastack:
-    depends_on:
-      - vllm-inference:
-          condition: service_healthy
-      - vllm-${VLLM_SAFETY_MODEL:+safety}:
-          condition: service_healthy
-    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
-    network_mode: ${NETWORK_MODE:-bridged}
-    environment:
-      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
-      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
-      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      - MAX_TOKENS=${MAX_TOKENS:-4096}
-      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
-      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
-    ports:
-      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
-    # Hack: wait for vLLM server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 8321"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
-volumes:
-  vllm-inference:
-  vllm-safety:
-  llamastack:
--- a/distributions/remote-vllm/run-with-safety.yaml
+++ b/distributions/remote-vllm/run-with-safety.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/remote-vllm/run-with-safety.yaml
--- a/distributions/remote-vllm/run.yaml
+++ b/distributions/remote-vllm/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/remote-vllm/run.yaml
--- a/distributions/runpod/build.yaml
+++ b/distributions/runpod/build.yaml
@ -1,9 +0,0 @@
-name: runpod
-distribution_spec:
-  description: Use Runpod for running LLM inference
-  providers:
-    inference: remote::runpod
-    memory: meta-reference
-    safety: meta-reference
-    agents: meta-reference
-    telemetry: meta-reference
--- a/distributions/sambanova/build.yaml
+++ b/distributions/sambanova/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/sambanova/build.yaml
--- a/distributions/sambanova/compose.yaml
+++ b/distributions/sambanova/compose.yaml
@ -1,16 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-sambanova
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-sambanova.yaml
-    ports:
-      - "5000:5000"
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-sambanova.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/sambanova/run.yaml
+++ b/distributions/sambanova/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/sambanova/run.yaml
--- a/distributions/tgi/build.yaml
+++ b/distributions/tgi/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/tgi/build.yaml
--- a/distributions/tgi/compose.yaml
+++ b/distributions/tgi/compose.yaml
@ -1,103 +0,0 @@
-services:
-  tgi-inference:
-    image: ghcr.io/huggingface/text-generation-inference:latest
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-       - "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
-      - HF_TOKEN=$HF_TOKEN
-      - HF_HOME=/data
-      - HF_DATASETS_CACHE=/data
-      - HF_MODULES_CACHE=/data
-      - HF_HUB_CACHE=/data
-    command: >
-      --dtype bfloat16
-      --usage-stats off
-      --sharded false
-      --model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      --port ${TGI_INFERENCE_PORT:-8080}
-      --cuda-memory-fraction 0.75
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
-      interval: 5s
-      timeout: 5s
-      retries: 30
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            capabilities: [gpu]
-    runtime: nvidia
-
-  tgi-${TGI_SAFETY_MODEL:+safety}:
-    image: ghcr.io/huggingface/text-generation-inference:latest
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-       - "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
-      - HF_TOKEN=$HF_TOKEN
-      - HF_HOME=/data
-      - HF_DATASETS_CACHE=/data
-      - HF_MODULES_CACHE=/data
-      - HF_HUB_CACHE=/data
-    command: >
-      --dtype bfloat16
-      --usage-stats off
-      --sharded false
-      --model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
-      --port ${TGI_SAFETY_PORT:-8081}
-      --cuda-memory-fraction 0.75
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
-      interval: 5s
-      timeout: 5s
-      retries: 30
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            capabilities: [gpu]
-    runtime: nvidia
-
-  llamastack:
-    depends_on:
-      tgi-inference:
-        condition: service_healthy
-      tgi-${TGI_SAFETY_MODEL:+safety}:
-        condition: service_healthy
-    image: llamastack/distribution-tgi:test-0.0.52rc3
-    network_mode: ${NETWORK_MODE:-bridged}
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
-    ports:
-      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
-    # Hack: wait for TGI server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    restart_policy:
-      condition: on-failure
-      delay: 3s
-      max_attempts: 5
-      window: 60s
-    environment:
-      - TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
-      - SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
-      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
-
-volumes:
-  tgi-inference:
-  tgi-safety:
-  llamastack:
--- a/distributions/tgi/run-with-safety.yaml
+++ b/distributions/tgi/run-with-safety.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/tgi/run-with-safety.yaml
--- a/distributions/tgi/run.yaml
+++ b/distributions/tgi/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/tgi/run.yaml
--- a/distributions/together/build.yaml
+++ b/distributions/together/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/together/build.yaml
--- a/distributions/together/compose.yaml
+++ b/distributions/together/compose.yaml
@ -1,14 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-together
-    ports:
-      - "8321:8321"
-    environment:
-      - TOGETHER_API_KEY=${TOGETHER_API_KEY}
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template together"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/together/run.yaml
+++ b/distributions/together/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/together/run.yaml
--- a/distributions/vllm-gpu/build.yaml
+++ b/distributions/vllm-gpu/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/inline-vllm/build.yaml
--- a/distributions/vllm-gpu/compose.yaml
+++ b/distributions/vllm-gpu/compose.yaml
@ -1,35 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-inline-vllm
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            # that's the closest analogue to --gpus; provide
-            # an integer amount of devices or 'all'
-            count: 1
-            # Devices are reserved using a list of capabilities, making
-            # capabilities the only required field. A device MUST
-            # satisfy all the requested capabilities for a successful
-            # reservation.
-            capabilities: [gpu]
-    runtime: nvidia
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/vllm-gpu/run.yaml
+++ b/distributions/vllm-gpu/run.yaml
@ -1,66 +0,0 @@
-version: '2'
-image_name: local
-container_image: null
-conda_env: local
-apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
-providers:
-  inference:
-  - provider_id: vllm-inference
-    provider_type: inline::vllm
-    config:
-      model: Llama3.2-3B-Instruct
-      tensor_parallel_size: 1
-      gpu_memory_utilization: 0.4
-      enforce_eager: true
-      max_tokens: 4096
-  - provider_id: vllm-inference-safety
-    provider_type: inline::vllm
-    config:
-      model: Llama-Guard-3-1B
-      tensor_parallel_size: 1
-      gpu_memory_utilization: 0.2
-      enforce_eager: true
-      max_tokens: 4096
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  # Uncomment to use prompt guard
-  # - provider_id: meta1
-  #   provider_type: inline::prompt-guard
-  #   config:
-  #     model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-  # Uncomment to use pgvector
-  # - provider_id: pgvector
-  #   provider_type: remote::pgvector
-  #   config:
-  #     host: 127.0.0.1
-  #     port: 5432
-  #     db: postgres
-  #     user: postgres
-  #     password: mysecretpassword
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/agents_store.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -818,14 +818,7 @@
            "delete": {
                "responses": {
                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/FileResponse"
-                                }
-                            }
-                        }
+                        "description": "OK"
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
@ -2122,7 +2115,7 @@
                        "content": {
                            "application/json": {
                                "schema": {
-                                    "$ref": "#/components/schemas/IterrowsResponse"
+                                    "$ref": "#/components/schemas/PaginatedResponse"
                                }
                            }
                        }
@ -2143,7 +2136,7 @@
                "tags": [
                    "DatasetIO"
                ],
-                "description": "Get a paginated list of rows from a dataset. Uses cursor-based pagination.",
+                "description": "Get a paginated list of rows from a dataset.\nUses offset-based pagination where:\n- start_index: The starting index (0-based). If None, starts from beginning.\n- limit: Number of items to return. If None or -1, returns all items.\n\nThe response includes:\n- data: List of items for the current page\n- has_more: Whether there are more items available after this set",
                "parameters": [
                    {
                        "name": "dataset_id",
@ -2695,9 +2688,9 @@
                    "200": {
                        "description": "OK",
                        "content": {
-                            "application/jsonl": {
+                            "application/json": {
                                "schema": {
-                                    "$ref": "#/components/schemas/ToolDef"
+                                    "$ref": "#/components/schemas/ListToolDefsResponse"
                                }
                            }
                        }
@ -4053,22 +4046,33 @@
                "type": "object",
                "properties": {
                    "strategy": {
-                        "$ref": "#/components/schemas/SamplingStrategy"
+                        "$ref": "#/components/schemas/SamplingStrategy",
+                        "description": "The sampling strategy."
                    },
                    "max_tokens": {
                        "type": "integer",
-                        "default": 0
+                        "default": 0,
+                        "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
                    },
                    "repetition_penalty": {
                        "type": "number",
-                        "default": 1.0
+                        "default": 1.0,
+                        "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics."
+                    },
+                    "stop": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        },
+                        "description": "Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."
                    }
                },
                "additionalProperties": false,
                "required": [
                    "strategy"
                ],
-                "title": "SamplingParams"
+                "title": "SamplingParams",
+                "description": "Sampling parameters."
            },
            "SamplingStrategy": {
                "oneOf": [
@ -6129,46 +6133,6 @@
                "title": "FileUploadResponse",
                "description": "Response after initiating a file upload session."
            },
-            "FileResponse": {
-                "type": "object",
-                "properties": {
-                    "bucket": {
-                        "type": "string",
-                        "description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)"
-                    },
-                    "key": {
-                        "type": "string",
-                        "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)"
-                    },
-                    "mime_type": {
-                        "type": "string",
-                        "description": "MIME type of the file"
-                    },
-                    "url": {
-                        "type": "string",
-                        "description": "Upload URL for the file contents"
-                    },
-                    "bytes": {
-                        "type": "integer",
-                        "description": "Size of the file in bytes"
-                    },
-                    "created_at": {
-                        "type": "integer",
-                        "description": "Timestamp of when the file was created"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "bucket",
-                    "key",
-                    "mime_type",
-                    "url",
-                    "bytes",
-                    "created_at"
-                ],
-                "title": "FileResponse",
-                "description": "Response representing a file entry."
-            },
            "EmbeddingsRequest": {
                "type": "object",
                "properties": {
@ -6922,6 +6886,46 @@
                "title": "URIDataSource",
                "description": "A dataset that can be obtained from a URI."
            },
+            "FileResponse": {
+                "type": "object",
+                "properties": {
+                    "bucket": {
+                        "type": "string",
+                        "description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)"
+                    },
+                    "key": {
+                        "type": "string",
+                        "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)"
+                    },
+                    "mime_type": {
+                        "type": "string",
+                        "description": "MIME type of the file"
+                    },
+                    "url": {
+                        "type": "string",
+                        "description": "Upload URL for the file contents"
+                    },
+                    "bytes": {
+                        "type": "integer",
+                        "description": "Size of the file in bytes"
+                    },
+                    "created_at": {
+                        "type": "integer",
+                        "description": "Timestamp of when the file was created"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "bucket",
+                    "key",
+                    "mime_type",
+                    "url",
+                    "bytes",
+                    "created_at"
+                ],
+                "title": "FileResponse",
+                "description": "Response representing a file entry."
+            },
            "Model": {
                "type": "object",
                "properties": {
@ -7660,7 +7664,8 @@
                            "completed",
                            "in_progress",
                            "failed",
-                            "scheduled"
+                            "scheduled",
+                            "cancelled"
                        ],
                        "title": "JobStatus"
                    },
@ -8068,7 +8073,7 @@
                "additionalProperties": false,
                "title": "ToolInvocationResult"
            },
-            "IterrowsResponse": {
+            "PaginatedResponse": {
                "type": "object",
                "properties": {
                    "data": {
@ -8098,19 +8103,20 @@
                                ]
                            }
                        },
-                        "description": "The rows in the current page."
+                        "description": "The list of items for the current page"
                    },
-                    "next_start_index": {
-                        "type": "integer",
-                        "description": "Index into dataset for the first row in the next page. None if there are no more rows."
+                    "has_more": {
+                        "type": "boolean",
+                        "description": "Whether there are more items available after this set"
                    }
                },
                "additionalProperties": false,
                "required": [
-                    "data"
+                    "data",
+                    "has_more"
                ],
-                "title": "IterrowsResponse",
-                "description": "A paginated list of rows from a dataset."
+                "title": "PaginatedResponse",
+                "description": "A generic paginated response that follows a simple format."
            },
            "Job": {
                "type": "object",
@ -8124,7 +8130,8 @@
                            "completed",
                            "in_progress",
                            "failed",
-                            "scheduled"
+                            "scheduled",
+                            "cancelled"
                        ],
                        "title": "JobStatus"
                    }
@ -8321,6 +8328,22 @@
                ],
                "title": "ListRoutesResponse"
            },
+            "ListToolDefsResponse": {
+                "type": "object",
+                "properties": {
+                    "data": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/ToolDef"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "data"
+                ],
+                "title": "ListToolDefsResponse"
+            },
            "ListScoringFunctionsResponse": {
                "type": "object",
                "properties": {
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -557,10 +557,6 @@ paths:
      responses:
        '200':
          description: OK
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/FileResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
@ -1447,7 +1443,7 @@ paths:
          content:
            application/json:
              schema:
-                $ref: '#/components/schemas/IterrowsResponse'
+                $ref: '#/components/schemas/PaginatedResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
@ -1461,7 +1457,20 @@ paths:
      tags:
        - DatasetIO
      description: >-
-        Get a paginated list of rows from a dataset. Uses cursor-based pagination.
+        Get a paginated list of rows from a dataset.
+
+        Uses offset-based pagination where:
+
+        - start_index: The starting index (0-based). If None, starts from beginning.
+
+        - limit: Number of items to return. If None or -1, returns all items.
+
+
+        The response includes:
+
+        - data: List of items for the current page
+
+        - has_more: Whether there are more items available after this set
      parameters:
        - name: dataset_id
          in: path
@ -1846,9 +1855,9 @@ paths:
        '200':
          description: OK
          content:
-            application/jsonl:
+            application/json:
              schema:
-                $ref: '#/components/schemas/ToolDef'
+                $ref: '#/components/schemas/ListToolDefsResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
@ -2787,16 +2796,33 @@ components:
      properties:
        strategy:
          $ref: '#/components/schemas/SamplingStrategy'
+          description: The sampling strategy.
        max_tokens:
          type: integer
          default: 0
+          description: >-
+            The maximum number of tokens that can be generated in the completion.
+            The token count of your prompt plus max_tokens cannot exceed the model's
+            context length.
        repetition_penalty:
          type: number
          default: 1.0
+          description: >-
+            Number between -2.0 and 2.0. Positive values penalize new tokens based
+            on whether they appear in the text so far, increasing the model's likelihood
+            to talk about new topics.
+        stop:
+          type: array
+          items:
+            type: string
+          description: >-
+            Up to 4 sequences where the API will stop generating further tokens. The
+            returned text will not contain the stop sequence.
      additionalProperties: false
      required:
        - strategy
      title: SamplingParams
+      description: Sampling parameters.
    SamplingStrategy:
      oneOf:
        - $ref: '#/components/schemas/GreedySamplingStrategy'
@ -4269,39 +4295,6 @@ components:
      title: FileUploadResponse
      description: >-
        Response after initiating a file upload session.
-    FileResponse:
-      type: object
-      properties:
-        bucket:
-          type: string
-          description: >-
-            Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)
-        key:
-          type: string
-          description: >-
-            Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
-        mime_type:
-          type: string
-          description: MIME type of the file
-        url:
-          type: string
-          description: Upload URL for the file contents
-        bytes:
-          type: integer
-          description: Size of the file in bytes
-        created_at:
-          type: integer
-          description: Timestamp of when the file was created
-      additionalProperties: false
-      required:
-        - bucket
-        - key
-        - mime_type
-        - url
-        - bytes
-        - created_at
-      title: FileResponse
-      description: Response representing a file entry.
    EmbeddingsRequest:
      type: object
      properties:
@ -4813,6 +4806,39 @@ components:
      title: URIDataSource
      description: >-
        A dataset that can be obtained from a URI.
+    FileResponse:
+      type: object
+      properties:
+        bucket:
+          type: string
+          description: >-
+            Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)
+        key:
+          type: string
+          description: >-
+            Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
+        mime_type:
+          type: string
+          description: MIME type of the file
+        url:
+          type: string
+          description: Upload URL for the file contents
+        bytes:
+          type: integer
+          description: Size of the file in bytes
+        created_at:
+          type: integer
+          description: Timestamp of when the file was created
+      additionalProperties: false
+      required:
+        - bucket
+        - key
+        - mime_type
+        - url
+        - bytes
+        - created_at
+      title: FileResponse
+      description: Response representing a file entry.
    Model:
      type: object
      properties:
@ -5289,6 +5315,7 @@ components:
            - in_progress
            - failed
            - scheduled
+            - cancelled
          title: JobStatus
        scheduled_at:
          type: string
@ -5528,7 +5555,7 @@ components:
              - type: object
      additionalProperties: false
      title: ToolInvocationResult
-    IterrowsResponse:
+    PaginatedResponse:
      type: object
      properties:
        data:
@ -5543,17 +5570,18 @@ components:
                - type: string
                - type: array
                - type: object
-          description: The rows in the current page.
-        next_start_index:
-          type: integer
+          description: The list of items for the current page
+        has_more:
+          type: boolean
          description: >-
-            Index into dataset for the first row in the next page. None if there are
-            no more rows.
+            Whether there are more items available after this set
      additionalProperties: false
      required:
        - data
-      title: IterrowsResponse
-      description: A paginated list of rows from a dataset.
+        - has_more
+      title: PaginatedResponse
+      description: >-
+        A generic paginated response that follows a simple format.
    Job:
      type: object
      properties:
@ -5566,6 +5594,7 @@ components:
            - in_progress
            - failed
            - scheduled
+            - cancelled
          title: JobStatus
      additionalProperties: false
      required:
@ -5703,6 +5732,17 @@ components:
      required:
        - data
      title: ListRoutesResponse
+    ListToolDefsResponse:
+      type: object
+      properties:
+        data:
+          type: array
+          items:
+            $ref: '#/components/schemas/ToolDef'
+      additionalProperties: false
+      required:
+        - data
+      title: ListToolDefsResponse
    ListScoringFunctionsResponse:
      type: object
      properties:
--- a/docs/_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
+++ b/docs/_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
--- a/docs/_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
+++ b/docs/_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
--- a/docs/_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
+++ b/docs/_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
@ -963,16 +963,19 @@
        "\n",
        "client.benchmarks.register(\n",
        "    benchmark_id=\"meta-reference::mmmu\",\n",
+        "    # Note: we can use any value as `dataset_id` because we'll be using the `evaluate_rows` API which accepts the \n",
+        "    # `input_rows` argument and does not fetch data from the dataset.\n",
        "    dataset_id=f\"mmmu-{subset}-{split}\",\n",
-        "    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
+        "    # Note: for the same reason as above, we can use any value as `scoring_functions`.\n",
+        "    scoring_functions=[],\n",
        ")\n",
        "\n",
-        "response = client.eval.evaluate_rows_alpha(\n",
+        "response = client.eval.evaluate_rows(\n",
        "    benchmark_id=\"meta-reference::mmmu\",\n",
        "    input_rows=eval_rows,\n",
+        "    # Note: Here we define the actual scoring functions.\n",
        "    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
        "    benchmark_config={\n",
-        "        \"type\": \"benchmark\",\n",
        "        \"eval_candidate\": {\n",
        "            \"type\": \"model\",\n",
        "            \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@ -1139,12 +1142,11 @@
        "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
        ")\n",
        "\n",
-        "response = client.eval.evaluate_rows_alpha(\n",
+        "response = client.eval.evaluate_rows(\n",
        "    benchmark_id=\"meta-reference::simpleqa\",\n",
        "    input_rows=eval_rows.data,\n",
        "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
        "    benchmark_config={\n",
-        "        \"type\": \"benchmark\",\n",
        "        \"eval_candidate\": {\n",
        "            \"type\": \"model\",\n",
        "            \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@ -1288,12 +1290,11 @@
        "    \"enable_session_persistence\": False,\n",
        "}\n",
        "\n",
-        "response = client.eval.evaluate_rows_alpha(\n",
+        "response = client.eval.evaluate_rows(\n",
        "    benchmark_id=\"meta-reference::simpleqa\",\n",
        "    input_rows=eval_rows.data,\n",
        "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
        "    benchmark_config={\n",
-        "        \"type\": \"benchmark\",\n",
        "        \"eval_candidate\": {\n",
        "            \"type\": \"agent\",\n",
        "            \"config\": agent_config,\n",
--- a/docs/openapi_generator/generate.py
+++ b/docs/openapi_generator/generate.py
@ -21,7 +21,7 @@ from llama_stack.distribution.stack import LlamaStack  # noqa: E402

 from .pyopenapi.options import Options  # noqa: E402
 from .pyopenapi.specification import Info, Server  # noqa: E402
-from .pyopenapi.utility import Specification, validate_api_method_return_types  # noqa: E402
+from .pyopenapi.utility import Specification, validate_api  # noqa: E402


 def str_presenter(dumper, data):
@ -40,8 +40,7 @@ def main(output_dir: str):
        raise ValueError(f"Directory {output_dir} does not exist")

    # Validate API protocols before generating spec
-    print("Validating API method return types...")
-    return_type_errors = validate_api_method_return_types()
+    return_type_errors = validate_api()
    if return_type_errors:
        print("\nAPI Method Return Type Validation Errors:\n")
        for error in return_type_errors:
--- a/docs/openapi_generator/pyopenapi/utility.py
+++ b/docs/openapi_generator/pyopenapi/utility.py
@ -7,10 +7,9 @@
 import json
 import typing
 import inspect
-import os
 from pathlib import Path
 from typing import TextIO
-from typing import Any, Dict, List, Optional, Protocol, Type, Union, get_type_hints, get_origin, get_args
+from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args

 from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
 from llama_stack.distribution.resolver import api_protocol_map
@ -125,29 +124,89 @@ def is_optional_type(type_: Any) -> bool:
    return origin is Optional or (origin is Union and type(None) in args)


-def validate_api_method_return_types() -> List[str]:
-    """Validate that all API methods have proper return types."""
-    errors = []
-    protocols = api_protocol_map()
-
-    for protocol_name, protocol in protocols.items():
-        methods = inspect.getmembers(protocol, predicate=inspect.isfunction)
-
-        for method_name, method in methods:
-            if not hasattr(method, '__webmethod__'):
-                continue
-
-            # Only check GET methods
-            if method.__webmethod__.method != "GET":
-                continue
-
+def _validate_api_method_return_type(method) -> str | None:
    hints = get_type_hints(method)

    if 'return' not in hints:
-                errors.append(f"Method {protocol_name}.{method_name} has no return type annotation")
-            else:
+        return "has no return type annotation"
+
    return_type = hints['return']
    if is_optional_type(return_type):
-                    errors.append(f"Method {protocol_name}.{method_name} returns Optional type")
+        return "returns Optional type where a return value is mandatory"
+
+
+def _validate_api_method_doesnt_return_list(method) -> str | None:
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if get_origin(return_type) is list:
+        return "returns a list where a PaginatedResponse or List*Response object is expected"
+
+
+def _validate_api_delete_method_returns_none(method) -> str | None:
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if return_type is not None and return_type is not type(None):
+        return "does not return None where None is mandatory"
+
+
+def _validate_list_parameters_contain_data(method) -> str | None:
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if not inspect.isclass(return_type):
+        return
+
+    if not return_type.__name__.startswith('List'):
+        return
+
+    if 'data' not in return_type.model_fields:
+        return "does not have a mandatory data attribute containing the list of objects"
+
+
+_VALIDATORS = {
+    "GET": [
+        _validate_api_method_return_type,
+        _validate_list_parameters_contain_data,
+        _validate_api_method_doesnt_return_list,
+    ],
+    "DELETE": [
+        _validate_api_delete_method_returns_none,
+    ],
+}
+
+
+def _get_methods_by_type(protocol, method_type: str):
+    members = inspect.getmembers(protocol, predicate=inspect.isfunction)
+    return {
+        method_name: method
+        for method_name, method in members
+        if (webmethod := getattr(method, '__webmethod__', None))
+        if webmethod and webmethod.method == method_type
+    }
+
+
+def validate_api() -> List[str]:
+    """Validate the API protocols."""
+    errors = []
+    protocols = api_protocol_map()
+
+    for target, validators in _VALIDATORS.items():
+        for protocol_name, protocol in protocols.items():
+            for validator in validators:
+                for method_name, method in _get_methods_by_type(protocol, target).items():
+                    err = validator(method)
+                    if err:
+                        errors.append(f"Method {protocol_name}.{method_name} {err}")

    return errors
--- a/docs/source/building_applications/index.md
+++ b/docs/source/building_applications/index.md
@ -1,4 +1,4 @@
-# Building AI Applications
+# Building AI Applications (Examples)

 Llama Stack provides all the building blocks needed to create sophisticated AI applications.

--- a/docs/source/building_applications/rag.md
+++ b/docs/source/building_applications/rag.md
@ -1,4 +1,4 @@
-## Using Retrieval Augmented Generation (RAG)
+## Retrieval Augmented Generation (RAG)

 RAG enables your applications to reference and recall information from previous interactions or external documents.

--- a/docs/source/building_applications/telemetry.md
+++ b/docs/source/building_applications/telemetry.md
@ -45,14 +45,16 @@ Here's an example that sends telemetry signals to all three sink types. Your con
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
-      sinks: ['console', 'sqlite', 'otel']
-      otel_endpoint: "http://localhost:4318/v1/traces"
+      sinks: ['console', 'sqlite', 'otel_trace', 'otel_metric']
+      otel_trace_endpoint: "http://localhost:4318/v1/traces"
+      otel_metric_endpoint: "http://localhost:4318/v1/metrics"
      sqlite_db_path: "/path/to/telemetry.db"
 ```

 ### Jaeger to visualize traces

-The `otel` sink works with any service compatible with the OpenTelemetry collector. Let's use Jaeger to visualize this data.
+The `otel_trace` and `otel_metric` sinks work with any service compatible with the OpenTelemetry collector; traces and metrics have two separate endpoints.
+Let's use Jaeger to visualize this data.

 Start a Jaeger instance with the OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686 using the following command:

--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -16,6 +16,7 @@ from docutils import nodes
 from pathlib import Path
 import requests
 import json
+from datetime import datetime

 # Read version from pyproject.toml
 with Path(__file__).parent.parent.parent.joinpath("pyproject.toml").open("rb") as f:
@ -28,7 +29,7 @@ with Path(__file__).parent.parent.parent.joinpath("pyproject.toml").open("rb") a
    llama_stack_version_link = f"<a href='{llama_stack_version_url}'>release notes</a>"

 project = "llama-stack"
-copyright = "2025, Meta"
+copyright = f"{datetime.now().year}, Meta"
 author = "Meta"

 # -- General configuration ---------------------------------------------------
@ -37,6 +38,7 @@ author = "Meta"
 extensions = [
    "myst_parser",
    "sphinx_rtd_theme",
+    "sphinx_rtd_dark_mode",
    "sphinx_copybutton",
    "sphinx_tabs.tabs",
    "sphinx_design",
@ -103,6 +105,8 @@ source_suffix = {
 # html_theme = "alabaster"
 html_theme_options = {
    "canonical_url": "https://github.com/meta-llama/llama-stack",
+    'collapse_navigation': False,
+
    # "style_nav_header_background": "#c3c9d4",
 }

--- a/docs/source/contributing/index.md
+++ b/docs/source/contributing/index.md
@ -1,14 +1,14 @@
-# Contributing to Llama Stack

-Start with the [Contributing Guide](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md) for some general tips. This section covers a few key topics in more detail.
+```{include} ../../../CONTRIBUTING.md
+```
+
+See [Adding a New API Provider](new_api_provider.md), which describes how to add new API providers to the Stack.
+

- [Adding a New API Provider](new_api_provider.md) describes adding new API providers to the Stack.
- [Testing Llama Stack](testing.md) provides details about the testing framework and how to test providers and distributions.

 ```{toctree}
 :maxdepth: 1
 :hidden:

 new_api_provider
-testing
 ```
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@ -67,7 +67,7 @@ options:
                        Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config. (default:
                        conda)
  --image-name IMAGE_NAME
-                        [for image-type=conda|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active Conda environment will be used if
+                        [for image-type=conda|container|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active Conda environment will be used if
                        found. (default: None)
  --print-deps-only     Print the dependencies for the stack only, without building the stack (default: False)
  --run                 Run the stack after building using the same image type, name, and other applicable arguments (default: False)
--- a/docs/source/distributions/configuration.md
+++ b/docs/source/distributions/configuration.md
@ -1,4 +1,4 @@
-# Configuring a Stack
+# Configuring a "Stack"

 The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution:

--- a/docs/source/distributions/importing_as_library.md
+++ b/docs/source/distributions/importing_as_library.md
@ -1,10 +1,12 @@
 # Using Llama Stack as a Library

-If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library. This avoids the overhead of setting up a server.
+## Setup Llama Stack without a Server
+If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library.
+This avoids the overhead of setting up a server.
 ```bash
 # setup
 uv pip install llama-stack
-llama stack build --template together --image-type venv
+llama stack build --template ollama --image-type venv
 ```

 ```python
--- a/docs/source/distributions/index.md
+++ b/docs/source/distributions/index.md
@ -1,34 +1,18 @@
-# Starting a Llama Stack Server
+# Distributions Overview

-You can run a Llama Stack server in one of the following ways:
-
-**As a Library**:
-
-This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and relying on an external inference service (eg. fireworks, together, groq, etc.) See [Using Llama Stack as a Library](importing_as_library)
-
-
-**Container**:
-
-Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
-
-
-**Conda**:
-
-If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
-
-
-**Kubernetes**:
-
-If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally. See [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
+A distribution is a pre-packaged set of Llama Stack components that can be deployed together.

+This section provides an overview of the distributions available in Llama Stack.

 ```{toctree}
-:maxdepth: 1
-:hidden:
+:maxdepth: 3

 importing_as_library
-building_distro
 configuration
-selection
+list_of_distributions
 kubernetes_deployment
+building_distro
+on_device_distro
+remote_hosted_distro
+self_hosted_distro
 ```
--- a/docs/source/distributions/kubernetes_deployment.md
+++ b/docs/source/distributions/kubernetes_deployment.md
@ -1,6 +1,9 @@
 # Kubernetes Deployment Guide

-Instead of starting the Llama Stack and vLLM servers locally. We can deploy them in a Kubernetes cluster. In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
+Instead of starting the Llama Stack and vLLM servers locally, we can deploy them in a Kubernetes cluster.
+
+### Prerequisites
+In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.

 First, create a local Kubernetes cluster via Kind:

@ -8,7 +11,7 @@ First, create a local Kubernetes cluster via Kind:
 kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test
 ```

-Start vLLM server as a Kubernetes Pod and Service:
+Then, create a Kubernetes PVC and Secret for downloading and storing the Hugging Face model:

 ```bash
 cat <<EOF |kubectl apply -f -
@ -31,7 +34,13 @@ metadata:
 type: Opaque
 data:
  token: $(HF_TOKEN)
---
+```
+
+
+Next, start the vLLM server as a Kubernetes Deployment and Service:
+
+```bash
+cat <<EOF |kubectl apply -f -
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@ -47,28 +56,23 @@ spec:
        app.kubernetes.io/name: vllm
    spec:
      containers:
-      - name: llama-stack
-        image: $(VLLM_IMAGE)
-        command:
-            - bash
-            - -c
-            - |
-              MODEL="meta-llama/Llama-3.2-1B-Instruct"
-              MODEL_PATH=/app/model/$(basename $MODEL)
-              huggingface-cli login --token $HUGGING_FACE_HUB_TOKEN
-              huggingface-cli download $MODEL --local-dir $MODEL_PATH --cache-dir $MODEL_PATH
-              python3 -m vllm.entrypoints.openai.api_server --model $MODEL_PATH --served-model-name $MODEL --port 8000
-        ports:
-          - containerPort: 8000
-        volumeMounts:
-          - name: llama-storage
-            mountPath: /app/model
+      - name: vllm
+        image: vllm/vllm-openai:latest
+        command: ["/bin/sh", "-c"]
+        args: [
+          "vllm serve meta-llama/Llama-3.2-1B-Instruct"
+        ]
        env:
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token-secret
              key: token
+        ports:
+          - containerPort: 8000
+        volumeMounts:
+          - name: llama-storage
+            mountPath: /root/.cache/huggingface
      volumes:
      - name: llama-storage
        persistentVolumeClaim:
@ -127,6 +131,7 @@ EOF
 podman build -f /tmp/test-vllm-llama-stack/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s /tmp/test-vllm-llama-stack
 ```

+### Deploying Llama Stack Server in Kubernetes

 We can then start the Llama Stack server by deploying a Kubernetes Pod and Service:

@ -187,6 +192,7 @@ spec:
 EOF
 ```

+### Verifying the Deployment
 We can check that the LlamaStack server has started:

 ```bash
--- a/docs/source/distributions/list_of_distributions.md
+++ b/docs/source/distributions/list_of_distributions.md
@ -1,4 +1,4 @@
-# List of Distributions
+# Available List of Distributions

 Here are a list of distributions you can use to start a Llama Stack server that are provided out of the box.

--- a/docs/source/distributions/remote_hosted_distro/nvidia.md
+++ b/docs/source/distributions/remote_hosted_distro/nvidia.md
@ -9,6 +9,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 | datasetio | `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::nvidia` |
+| post_training | `remote::nvidia` |
 | safety | `remote::nvidia` |
 | scoring | `inline::basic` |
 | telemetry | `inline::meta-reference` |
@ -21,6 +22,12 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 The following environment variables can be configured:

 - `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
+- `NVIDIA_USER_ID`: NVIDIA User ID (default: `llama-stack-user`)
+- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
+- `NVIDIA_ACCESS_POLICIES`: NVIDIA Access Policies (default: `{}`)
+- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
+- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
+- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
 - `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
 - `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
 - `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
--- a/docs/source/distributions/self_hosted_distro/remote-vllm.md
+++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md
@ -98,11 +98,14 @@ export INFERENCE_PORT=8000
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 export LLAMA_STACK_PORT=8321

+# You need a local checkout of llama-stack to run this, get it using
+# git clone https://github.com/meta-llama/llama-stack.git
+cd /path/to/llama-stack
+
 docker run \
-  -it \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
+  -v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \
  llamastack/distribution-remote-vllm \
  --yaml-config /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
@ -121,7 +124,6 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 cd /path/to/llama-stack

 docker run \
-  -it \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
--- a/docs/source/distributions/starting_llama_stack_server.md
+++ b/docs/source/distributions/starting_llama_stack_server.md
@ -0,0 +1,32 @@
+# Starting a Llama Stack Server
+
+You can run a Llama Stack server in one of the following ways:
+
+**As a Library**:
+
+This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and relying on an external inference service (eg. fireworks, together, groq, etc.) See [Using Llama Stack as a Library](importing_as_library)
+
+
+**Container**:
+
+Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
+
+
+**Conda**:
+
+If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
+
+
+**Kubernetes**:
+
+If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally, see the [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
+
+
+```{toctree}
+:maxdepth: 1
+:hidden:
+
+importing_as_library
+configuration
+kubernetes_deployment
+```
--- a/docs/source/getting_started/index.md
+++ b/docs/source/getting_started/index.md
@ -1,10 +1,11 @@
 # Quick Start

-In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple RAG agent.
+In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to build a simple [RAG (Retrieval Augmented Generation)](../building_applications/rag.md) agent.

 A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with tools (e.g., RAG, web search, code execution, etc.) for taking actions.

 In Llama Stack, we provide a server exposing multiple APIs. These APIs are backed by implementations from different providers. For this guide, we will use [Ollama](https://ollama.com/) as the inference provider.
+Ollama is an LLM runtime that allows you to run Llama models locally.


 ### 1. Start Ollama
@ -24,7 +25,7 @@ If you do not have ollama, you can install it from [here](https://ollama.com/dow

 ### 2. Pick a client environment

-Llama Stack has a service-oriented architecture, so every interaction with the Stack happens through an REST interface. You can interact with the Stack in two ways:
+Llama Stack has a service-oriented architecture, so every interaction with the Stack happens through a REST interface. You can interact with the Stack in two ways:

 * Install the `llama-stack-client` PyPI package and point `LlamaStackClient` to a local or remote Llama Stack server.
 * Or, install the `llama-stack` PyPI package and use the Stack as a library using `LlamaStackAsLibraryClient`.
--- a/docs/source/index.md
+++ b/docs/source/index.md
@ -6,6 +6,7 @@ Llama Stack {{ llama_stack_version }} is now available! See the {{ llama_stack_v

 # Llama Stack

+## What is Llama Stack?

 Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. It provides a unified set of APIs with implementations from leading service providers, enabling seamless transitions between development and production environments. More specifically, it provides

@ -22,6 +23,12 @@ Llama Stack defines and standardizes the core building blocks needed to bring ge

 Our goal is to provide pre-packaged implementations (aka "distributions") which can be run in a variety of deployment environments. LlamaStack can assist you in your entire app development lifecycle - start iterating on local, mobile or desktop and seamlessly transition to on-prem or public cloud deployments. At every point in this transition, the same set of APIs and the same developer experience is available.

+## How does Llama Stack work?
+Llama Stack consists of a [server](./distributions/index.md) (with multiple pluggable API [providers](./providers/index.md)) and [client SDKs](#available-sdks) meant to
+be used in your applications. The server can be run in a variety of environments, including local (inline)
+development, on-premises, and cloud. The client SDKs are available for Python, Swift, Node, and
+Kotlin.
+
 ## Quick Links

 - New to Llama Stack? Start with the [Introduction](introduction/index) to understand our motivation and vision.
@ -93,7 +100,6 @@ getting_started/index
 concepts/index
 providers/index
 distributions/index
-distributions/selection
 building_applications/index
 playground/index
 contributing/index
--- a/docs/source/playground/index.md
+++ b/docs/source/playground/index.md
@ -92,8 +92,6 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie

 ## Starting the Llama Stack Playground

-### Llama CLI
-
 To start the Llama Stack Playground, run the following commands:

 1. Start up the Llama Stack API server
@ -109,29 +107,3 @@ cd llama_stack/distribution/ui
 pip install -r requirements.txt
 streamlit run app.py
 ```
-
-### Docker
-
-Playground can also be started in a docker image:
-
-```sh
-export LLAMA_STACK_URL=http://localhost:11434
-
-docker run \
-  --pull always \
-  -p 8501:8501 \
-  -e LLAMA_STACK_ENDPOINT=$LLAMA_STACK_URL \
-  quay.io/jland/llama-stack-playground
-```
-
-## Configurable Environment Variables
-
-## Environment Variables
-
-| Environment Variable       | Description                        | Default Value             |
-|----------------------------|------------------------------------|---------------------------|
-| LLAMA_STACK_ENDPOINT       | The endpoint for the Llama Stack   | http://localhost:8321     |
-| FIREWORKS_API_KEY          | API key for Fireworks provider     | (empty string)            |
-| TOGETHER_API_KEY           | API key for Together provider      | (empty string)            |
-| SAMBANOVA_API_KEY          | API key for SambaNova provider     | (empty string)            |
-| OPENAI_API_KEY             | API key for OpenAI provider        | (empty string)            |
--- a/docs/source/providers/vector_io/sqlite-vec.md
+++ b/docs/source/providers/vector_io/sqlite-vec.md
@ -10,11 +10,57 @@ That means you're not limited to storing vectors in memory or in a separate serv
 ## Features

 - Lightweight and easy to use
- Fully integrated with Llama Stack
+- Fully integrated with Llama Stack
+- Uses disk-based storage for persistence, allowing for larger vector storage
+
+### Comparison to Faiss
+
+The choice between Faiss and sqlite-vec should be made based on the needs of your application,
+as they have different strengths.
+
+#### Choosing the Right Provider
+
+Scenario | Recommended Tool | Reason
+-- |-----------------| --
+Online Analytical Processing (OLAP) | Faiss           | Fast, in-memory searches
+Online Transaction Processing (OLTP) | sqlite-vec      | Frequent writes and reads
+Frequent writes | sqlite-vec      | Efficient disk-based storage and incremental indexing
+Large datasets | sqlite-vec      | Disk-based storage for larger vector storage
+Datasets that can fit in memory, frequent reads | Faiss | Optimized for speed, indexing, and GPU acceleration
+
+#### Empirical Example
+
+Consider the histogram below in which 10,000 randomly generated strings were inserted
+in batches of 100 into both Faiss and sqlite-vec using `client.tool_runtime.rag_tool.insert()`.
+
+```{image} ../../../../_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
+:alt: Comparison of SQLite-Vec and Faiss write times
+:width: 400px
+```
+
+You will notice that the average write time for `sqlite-vec` was 788ms, compared to
+47,640ms for Faiss. While the number is jarring, if you look at the distribution, you can see that it is rather
+uniformly spread across the [1500, 100000] interval.
+
+Looking at each individual write in the order that the documents are inserted, you'll see the increase in
+write time as Faiss reindexes the vectors after each write.
+```{image} ../../../../_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
+:alt: Comparison of SQLite-Vec and Faiss write times
+:width: 400px
+```
+
+In comparison, the read times for Faiss were on average 10% faster than those for sqlite-vec.
+The modes of the two distributions highlight the difference even further: Faiss
+will likely yield faster read performance.
+
+```{image} ../../../../_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
+:alt: Comparison of SQLite-Vec and Faiss read times
+:width: 400px
+```

 ## Usage

-To use SQLite-Vec in your Llama Stack project, follow these steps:
+To use sqlite-vec in your Llama Stack project, follow these steps:

 1. Install the necessary dependencies.
 2. Configure your Llama Stack project to use SQLite-Vec.
--- a/llama_stack/apis/common/job_types.py
+++ b/llama_stack/apis/common/job_types.py
@ -15,6 +15,7 @@ class JobStatus(Enum):
    in_progress = "in_progress"
    failed = "failed"
    scheduled = "scheduled"
+    cancelled = "cancelled"


@json_schema_type
--- a/llama_stack/apis/common/responses.py
+++ b/llama_stack/apis/common/responses.py
@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict, List
+
+from pydantic import BaseModel
+
+from llama_stack.schema_utils import json_schema_type
+
+
+@json_schema_type
+class PaginatedResponse(BaseModel):
+    """A generic paginated response that follows a simple format.
+
+    :param data: The list of items for the current page
+    :param has_more: Whether there are more items available after this set
+    """
+
+    data: List[Dict[str, Any]]
+    has_more: bool
--- a/llama_stack/apis/datasetio/datasetio.py
+++ b/llama_stack/apis/datasetio/datasetio.py
@ -6,23 +6,9 @@

 from typing import Any, Dict, List, Optional, Protocol, runtime_checkable

-from pydantic import BaseModel
-
+from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.apis.datasets import Dataset
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-@json_schema_type
-class IterrowsResponse(BaseModel):
-    """
-    A paginated list of rows from a dataset.
-
-    :param data: The rows in the current page.
-    :param next_start_index: Index into dataset for the first row in the next page. None if there are no more rows.
-    """
-
-    data: List[Dict[str, Any]]
-    next_start_index: Optional[int] = None
+from llama_stack.schema_utils import webmethod


 class DatasetStore(Protocol):
@ -34,15 +20,22 @@ class DatasetIO(Protocol):
    # keeping for aligning with inference/safety, but this is not used
    dataset_store: DatasetStore

-    # TODO(xiyan): there's a flakiness here where setting route to "/datasets/" here will not result in proper routing
    @webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET")
    async def iterrows(
        self,
        dataset_id: str,
        start_index: Optional[int] = None,
        limit: Optional[int] = None,
-    ) -> IterrowsResponse:
-        """Get a paginated list of rows from a dataset. Uses cursor-based pagination.
+    ) -> PaginatedResponse:
+        """Get a paginated list of rows from a dataset.
+
+        Uses offset-based pagination where:
+        - start_index: The starting index (0-based). If None, starts from beginning.
+        - limit: Number of items to return. If None or -1, returns all items.
+
+        The response includes:
+        - data: List of items for the current page
+        - has_more: Whether there are more items available after this set

        :param dataset_id: The ID of the dataset to get the rows from.
        :param start_index: Index into dataset for the first row to get. Get all rows if None.
--- a/llama_stack/apis/datatypes.py
+++ b/llama_stack/apis/datatypes.py
@ -34,6 +34,7 @@ class Api(Enum):
    scoring_functions = "scoring_functions"
    benchmarks = "benchmarks"
    tool_groups = "tool_groups"
+    files = "files"

    # built-in API
    inspect = "inspect"
--- a/llama_stack/apis/files/files.py
+++ b/llama_stack/apis/files/files.py
@ -164,7 +164,7 @@ class Files(Protocol):
        self,
        bucket: str,
        key: str,
-    ) -> FileResponse:
+    ) -> None:
        """
        Delete a file identified by a bucket and key.

--- a/llama_stack/apis/tools/tools.py
+++ b/llama_stack/apis/tools/tools.py
@ -88,6 +88,10 @@ class ListToolsResponse(BaseModel):
    data: List[Tool]


+class ListToolDefsResponse(BaseModel):
+    data: list[ToolDef]
+
+
@runtime_checkable
@trace_protocol
 class ToolGroups(Protocol):
@ -148,7 +152,7 @@ class ToolRuntime(Protocol):
    @webmethod(route="/tool-runtime/list-tools", method="GET")
    async def list_runtime_tools(
        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
-    ) -> List[ToolDef]: ...
+    ) -> ListToolDefsResponse: ...

    @webmethod(route="/tool-runtime/invoke", method="POST")
    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
--- a/llama_stack/cli/stack/_build.py
+++ b/llama_stack/cli/stack/_build.py
@ -21,6 +21,7 @@ from prompt_toolkit.completion import WordCompleter
 from prompt_toolkit.validation import Validator
 from termcolor import cprint

+from llama_stack.cli.stack.utils import ImageType
 from llama_stack.cli.table import print_table
 from llama_stack.distribution.build import (
    SERVER_DEPENDENCIES,
@ -62,10 +63,10 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
    if args.list_templates:
        return _run_template_list_cmd()

-    if args.image_type == "venv":
+    if args.image_type == ImageType.VENV.value:
        current_venv = os.environ.get("VIRTUAL_ENV")
        image_name = args.image_name or current_venv
-    elif args.image_type == "conda":
+    elif args.image_type == ImageType.CONDA.value:
        current_conda_env = os.environ.get("CONDA_DEFAULT_ENV")
        image_name = args.image_name or current_conda_env
    else:
@ -84,7 +85,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
            build_config.image_type = args.image_type
        else:
            cprint(
-                f"Please specify a image-type (container | conda | venv) for {args.template}",
+                f"Please specify a image-type ({' | '.join(e.value for e in ImageType)}) for {args.template}",
                color="red",
            )
            sys.exit(1)
@ -98,15 +99,15 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
        )

        image_type = prompt(
-            "> Enter the image type you want your Llama Stack to be built as (container or conda or venv): ",
+            f"> Enter the image type you want your Llama Stack to be built as ({' or '.join(e.value for e in ImageType)}): ",
            validator=Validator.from_callable(
-                lambda x: x in ["container", "conda", "venv"],
-                error_message="Invalid image type, please enter conda or container or venv",
+                lambda x: x in [e.value for e in ImageType],
+                error_message=f"Invalid image type, please enter {' or '.join(e.value for e in ImageType)}",
            ),
-            default="conda",
+            default=ImageType.CONDA.value,
        )

-        if image_type == "conda":
+        if image_type == ImageType.CONDA.value:
            if not image_name:
                cprint(
                    f"No current conda environment detected or specified, will create a new conda environment with the name `llamastack-{name}`",
@ -136,6 +137,8 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
        providers = dict()
        for api, providers_for_api in get_provider_registry().items():
            available_providers = [x for x in providers_for_api.keys() if x not in ("remote", "remote::sample")]
+            if not available_providers:
+                continue
            api_provider = prompt(
                "> Enter provider for API {}: ".format(api.value),
                completer=WordCompleter(available_providers),
--- a/llama_stack/cli/stack/build.py
+++ b/llama_stack/cli/stack/build.py
@ -6,6 +6,7 @@
 import argparse
 import textwrap

+from llama_stack.cli.stack.utils import ImageType
 from llama_stack.cli.subcommand import Subcommand


@ -46,16 +47,16 @@ class StackBuild(Subcommand):
        self.parser.add_argument(
            "--image-type",
            type=str,
-            help="Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config.",
-            choices=["conda", "container", "venv"],
-            default="conda",
+            help="Image Type to use for the build. If not specified, will use the image type from the template config.",
+            choices=[e.value for e in ImageType],
+            default=ImageType.CONDA.value,
        )

        self.parser.add_argument(
            "--image-name",
            type=str,
            help=textwrap.dedent(
-                """[for image-type=conda|venv] Name of the conda or virtual environment to use for
+                f"""[for image-type={"|".join(e.value for e in ImageType)}] Name of the conda or virtual environment to use for
 the build. If not specified, currently active Conda environment will be used if found.
            """
            ),
--- a/llama_stack/cli/stack/run.py
+++ b/llama_stack/cli/stack/run.py
@ -8,6 +8,7 @@ import argparse
 import os
 from pathlib import Path

+from llama_stack.cli.stack.utils import ImageType
 from llama_stack.cli.subcommand import Subcommand
 from llama_stack.log import get_logger

@ -56,7 +57,6 @@ class StackRun(Subcommand):
            "--env",
            action="append",
            help="Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.",
-            default=[],
            metavar="KEY=VALUE",
        )
        self.parser.add_argument(
@ -73,10 +73,24 @@ class StackRun(Subcommand):
            "--image-type",
            type=str,
            help="Image Type used during the build. This can be either conda or container or venv.",
-            choices=["conda", "container", "venv"],
-            default="conda",
+            choices=[e.value for e in ImageType],
        )

+    # If neither image type nor image name is provided, but at the same time
+    # the current environment has conda breadcrumbs, then assume that the user
+    # wants to use conda mode and not the usual default mode (using
+    # pre-installed system packages).
+    #
+    # Note: yes, this is hacky. It's implemented this way to keep the existing
+    # conda users unaffected by the switch of the default behavior to using
+    # system packages.
+    def _get_image_type_and_name(self, args: argparse.Namespace) -> tuple[str, str]:
+        conda_env = os.environ.get("CONDA_DEFAULT_ENV")
+        if conda_env and args.image_name == conda_env:
+            logger.warning(f"Conda detected. Using conda environment {conda_env} for the run.")
+            return ImageType.CONDA.value, args.image_name
+        return args.image_type, args.image_name
+
    def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
        import yaml

@ -120,18 +134,42 @@ class StackRun(Subcommand):
        except AttributeError as e:
            self.parser.error(f"failed to parse config file '{config_file}':\n {e}")

-        run_args = formulate_run_args(args.image_type, args.image_name, config, template_name)
+        image_type, image_name = self._get_image_type_and_name(args)
+
+        # If neither image type nor image name is provided, assume the server should be run directly
+        # using the current environment packages.
+        if not image_type and not image_name:
+            logger.info("No image type or image name provided. Assuming environment packages.")
+            from llama_stack.distribution.server.server import main as server_main
+
+            # Build the server args from the current args passed to the CLI
+            server_args = argparse.Namespace()
+            for arg in vars(args):
+                # If this is a function, avoid passing it
+                # "args" contains:
+                # func=<bound method StackRun._run_stack_run_cmd of <llama_stack.cli.stack.run.StackRun object at 0x10484b010>>
+                if callable(getattr(args, arg)):
+                    continue
+                setattr(server_args, arg, getattr(args, arg))
+
+            # Run the server
+            server_main(server_args)
+        else:
+            run_args = formulate_run_args(image_type, image_name, config, template_name)

            run_args.extend([str(config_file), str(args.port)])
            if args.disable_ipv6:
                run_args.append("--disable-ipv6")

+            if args.env:
                for env_var in args.env:
                    if "=" not in env_var:
                        self.parser.error(f"Environment variable '{env_var}' must be in KEY=VALUE format")
+                        return
                    key, value = env_var.split("=", 1)  # split on first = only
                    if not key:
                        self.parser.error(f"Environment variable '{env_var}' has empty key")
+                        return
                    run_args.extend(["--env", f"{key}={value}"])

            if args.tls_keyfile and args.tls_certfile:
--- a/llama_stack/cli/stack/utils.py
+++ b/llama_stack/cli/stack/utils.py
@ -4,6 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from enum import Enum
+
+
+class ImageType(Enum):
+    CONDA = "conda"
+    CONTAINER = "container"
+    VENV = "venv"
+

 def print_subcommand_description(parser, subparsers):
    """Print descriptions of subcommands."""
--- a/llama_stack/distribution/library_client.py
+++ b/llama_stack/distribution/library_client.py
@ -328,8 +328,9 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):

        body = self._convert_body(path, options.method, body)

-        async def gen():
        await start_trace(route, {"__location__": "library_client"})
+
+        async def gen():
            try:
                async for chunk in await func(**body):
                    data = json.dumps(convert_pydantic_to_json_value(chunk))
--- a/llama_stack/distribution/resolver.py
+++ b/llama_stack/distribution/resolver.py
@ -12,6 +12,7 @@ from llama_stack.apis.benchmarks import Benchmarks
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.eval import Eval
+from llama_stack.apis.files import Files
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.inspect import Inspect
 from llama_stack.apis.models import Models
@ -79,6 +80,7 @@ def api_protocol_map() -> Dict[Api, Any]:
        Api.post_training: PostTraining,
        Api.tool_groups: ToolGroups,
        Api.tool_runtime: ToolRuntime,
+        Api.files: Files,
    }


--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@ -12,7 +12,8 @@ from llama_stack.apis.common.content_types import (
    InterleavedContent,
    InterleavedContentItem,
 )
-from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import DatasetPurpose, DataSource
 from llama_stack.apis.eval import BenchmarkConfig, Eval, EvaluateResponse, Job
 from llama_stack.apis.inference import (
@ -45,11 +46,11 @@ from llama_stack.apis.scoring import (
 from llama_stack.apis.shields import Shield
 from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry
 from llama_stack.apis.tools import (
+    ListToolDefsResponse,
    RAGDocument,
    RAGQueryConfig,
    RAGQueryResult,
    RAGToolRuntime,
-    ToolDef,
    ToolRuntime,
 )
 from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
@ -497,7 +498,7 @@ class DatasetIORouter(DatasetIO):
        dataset_id: str,
        start_index: Optional[int] = None,
        limit: Optional[int] = None,
-    ) -> IterrowsResponse:
+    ) -> PaginatedResponse:
        logger.debug(
            f"DatasetIORouter.iterrows: {dataset_id}, {start_index=} {limit=}",
        )
@ -706,6 +707,6 @@ class ToolRuntimeRouter(ToolRuntime):

    async def list_runtime_tools(
        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
-    ) -> List[ToolDef]:
+    ) -> ListToolDefsResponse:
        logger.debug(f"ToolRuntimeRouter.list_runtime_tools: {tool_group_id}")
        return await self.routing_table.get_provider_impl(tool_group_id).list_tools(tool_group_id, mcp_endpoint)
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@ -568,7 +568,7 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
        tool_defs = await self.impls_by_provider_id[provider_id].list_runtime_tools(toolgroup_id, mcp_endpoint)
        tool_host = ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution

-        for tool_def in tool_defs:
+        for tool_def in tool_defs.data:
            tools.append(
                ToolWithACL(
                    identifier=tool_def.name,
--- a/llama_stack/distribution/server/server.py
+++ b/llama_stack/distribution/server/server.py
@ -15,7 +15,7 @@ import warnings
 from contextlib import asynccontextmanager
 from importlib.metadata import version as parse_version
 from pathlib import Path
-from typing import Any, List, Union
+from typing import Any, List, Optional, Union

 import yaml
 from fastapi import Body, FastAPI, HTTPException, Request
@ -294,11 +294,17 @@ class ClientVersionMiddleware:
        return await self.app(scope, receive, send)


-def main():
+def main(args: Optional[argparse.Namespace] = None):
    """Start the LlamaStack server."""
    parser = argparse.ArgumentParser(description="Start the LlamaStack server.")
    parser.add_argument(
        "--yaml-config",
+        dest="config",
+        help="(Deprecated) Path to YAML configuration file - use --config instead",
+    )
+    parser.add_argument(
+        "--config",
+        dest="config",
        help="Path to YAML configuration file",
    )
    parser.add_argument(
@ -328,12 +334,24 @@ def main():
        required="--tls-keyfile" in sys.argv,
    )

+    # Determine whether the server args are being passed by the "run" command. If so, the args
+    # are passed to the main function as a Namespace object; otherwise they are parsed from the
+    # command line.
+    if args is None:
        args = parser.parse_args()

+    # Check for deprecated argument usage
+    if "--yaml-config" in sys.argv:
+        warnings.warn(
+            "The '--yaml-config' argument is deprecated and will be removed in a future version. Use '--config' instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
    log_line = ""
-    if args.yaml_config:
+    if args.config:
        # if the user provided a config file, use it, even if template was specified
-        config_file = Path(args.yaml_config)
+        config_file = Path(args.config)
        if not config_file.exists():
            raise ValueError(f"Config file {config_file} does not exist")
        log_line = f"Using config file: {config_file}"
--- a/llama_stack/distribution/start_stack.sh
+++ b/llama_stack/distribution/start_stack.sh
@ -13,6 +13,7 @@ LLAMA_CHECKPOINT_DIR=${LLAMA_CHECKPOINT_DIR:-}
 LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
 TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
 PYPI_VERSION=${PYPI_VERSION:-}
+VIRTUAL_ENV=${VIRTUAL_ENV:-}

 set -euo pipefail

@ -69,10 +70,12 @@ while [[ $# -gt 0 ]]; do
    ;;
  esac
 done
-
 PYTHON_BINARY="python"
 case "$env_type" in
  "venv")
+    if [[ -n "$VIRTUAL_ENV" && "$VIRTUAL_ENV" == "$env_path_or_name" ]]; then
+        echo -e "${GREEN}Virtual environment already activated${NC}" >&2
+    else
        # Activate virtual environment
        if [ ! -d "$env_path_or_name" ]; then
            echo -e "${RED}Error: Virtual environment not found at $env_path_or_name${NC}" >&2
@ -85,6 +88,7 @@ case "$env_type" in
        fi

        source "$env_path_or_name/bin/activate"
+    fi
    ;;
  "conda")
    if ! is_command_available conda; then
--- a/llama_stack/distribution/ui/page/playground/rag.py
+++ b/llama_stack/distribution/ui/page/playground/rag.py
@ -58,6 +58,7 @@ def rag_chat_page():
                llama_stack_api.client.tool_runtime.rag_tool.insert(
                    vector_db_id=vector_db_name,  # Use the user-provided name
                    documents=documents,
+                    chunk_size_in_tokens=512,
                )
                st.success("Vector database created successfully!")

--- a/llama_stack/distribution/utils/context.py
+++ b/llama_stack/distribution/utils/context.py
@ -18,15 +18,19 @@ def preserve_contexts_async_generator(
    This is needed because we start a new asyncio event loop for each streaming request,
    and we need to preserve the context across the event loop boundary.
    """
+    # Capture initial context values
+    initial_context_values = {context_var.name: context_var.get() for context_var in context_vars}

    async def wrapper() -> AsyncGenerator[T, None]:
        while True:
            try:
-                item = await gen.__anext__()
-                context_values = {context_var.name: context_var.get() for context_var in context_vars}
-                yield item
+                # Restore context values before any await
                for context_var in context_vars:
-                    _ = context_var.set(context_values[context_var.name])
+                    context_var.set(initial_context_values[context_var.name])
+
+                item = await gen.__anext__()
+                yield item
+
            except StopAsyncIteration:
                break

--- a/llama_stack/log.py
+++ b/llama_stack/log.py
@ -139,7 +139,7 @@ def setup_logging(category_levels: Dict[str, int], log_file: str | None) -> None
        category_levels (Dict[str, int]): A dictionary mapping categories to their log levels.
        log_file (str): Path to a log file to additionally pipe the logs into
    """
-    log_format = "[dim]%(asctime)s %(name)s:%(lineno)d[/] [yellow dim]%(category)s[/]: %(message)s"
+    log_format = "%(asctime)s %(name)s:%(lineno)d %(category)s: %(message)s"

    class CategoryFilter(logging.Filter):
        """Ensure category is always present in log records."""
--- a/llama_stack/models/llama/datatypes.py
+++ b/llama_stack/models/llama/datatypes.py
@ -195,10 +195,22 @@ register_schema(SamplingStrategy, name="SamplingStrategy")

@json_schema_type
 class SamplingParams(BaseModel):
+    """Sampling parameters.
+
+    :param strategy: The sampling strategy.
+    :param max_tokens: The maximum number of tokens that can be generated in the completion. The token count of
+        your prompt plus max_tokens cannot exceed the model's context length.
+    :param repetition_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens
+        based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+    :param stop: Up to 4 sequences where the API will stop generating further tokens.
+        The returned text will not contain the stop sequence.
+    """
+
    strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy)

    max_tokens: Optional[int] = 0
    repetition_penalty: Optional[float] = 1.0
+    stop: Optional[List[str]] = None


 class CheckpointQuantizationFormat(Enum):
--- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
@ -57,11 +57,7 @@ from llama_stack.apis.inference import (
    UserMessage,
 )
 from llama_stack.apis.safety import Safety
-from llama_stack.apis.tools import (
-    ToolGroups,
-    ToolInvocationResult,
-    ToolRuntime,
-)
+from llama_stack.apis.tools import ToolGroups, ToolInvocationResult, ToolRuntime
 from llama_stack.apis.vector_io import VectorIO
 from llama_stack.log import get_logger
 from llama_stack.models.llama.datatypes import (
@ -459,7 +455,15 @@ class ChatAgent(ShieldRunnerMixin):
                contexts.append(raw_document_text)

            attached_context = "\n".join(contexts)
-            input_messages[-1].context = attached_context
+            if isinstance(input_messages[-1].content, str):
+                input_messages[-1].content += attached_context
+            elif isinstance(input_messages[-1].content, list):
+                input_messages[-1].content.append(TextContentItem(text=attached_context))
+            else:
+                input_messages[-1].content = [
+                    input_messages[-1].content,
+                    TextContentItem(text=attached_context),
+                ]

        session_info = await self.storage.get_session_info(session_id)
        # if the session has a memory bank id, let the memory tool use it
--- a/llama_stack/providers/inline/datasetio/localfs/datasetio.py
+++ b/llama_stack/providers/inline/datasetio/localfs/datasetio.py
@ -7,9 +7,11 @@ from typing import Any, Dict, List, Optional

 import pandas

-from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Dataset
 from llama_stack.providers.datatypes import DatasetsProtocolPrivate
+from llama_stack.providers.utils.datasetio.pagination import paginate_records
 from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_uri
 from llama_stack.providers.utils.kvstore import kvstore_impl

@ -92,24 +94,13 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
        dataset_id: str,
        start_index: Optional[int] = None,
        limit: Optional[int] = None,
-    ) -> IterrowsResponse:
+    ) -> PaginatedResponse:
        dataset_def = self.dataset_infos[dataset_id]
        dataset_impl = PandasDataframeDataset(dataset_def)
        await dataset_impl.load()

-        start_index = start_index or 0
-
-        if limit is None or limit == -1:
-            end = len(dataset_impl)
-        else:
-            end = min(start_index + limit, len(dataset_impl))
-
-        rows = dataset_impl[start_index:end]
-
-        return IterrowsResponse(
-            data=rows,
-            next_start_index=end if end < len(dataset_impl) else None,
-        )
+        records = dataset_impl.df.to_dict("records")
+        return paginate_records(records, start_index, limit)

    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
        dataset_def = self.dataset_infos[dataset_id]
--- a/llama_stack/providers/inline/telemetry/meta_reference/config.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/config.py
@ -28,6 +28,11 @@ class TelemetryConfig(BaseModel):
        default="http://localhost:4318/v1/metrics",
        description="The OpenTelemetry collector endpoint URL for metrics",
    )
+    service_name: str = Field(
+        # service name is always the same, use zero-width space to avoid clutter
+        default="",
+        description="The service name to use for telemetry",
+    )
    sinks: List[TelemetrySink] = Field(
        default=[TelemetrySink.CONSOLE, TelemetrySink.SQLITE],
        description="List of telemetry sinks to enable (possible values: otel, sqlite, console)",
@ -47,6 +52,7 @@ class TelemetryConfig(BaseModel):
    @classmethod
    def sample_run_config(cls, __distro_dir__: str, db_name: str = "trace_store.db") -> Dict[str, Any]:
        return {
+            "service_name": "${env.OTEL_SERVICE_NAME:}",
            "sinks": "${env.TELEMETRY_SINKS:console,sqlite}",
            "sqlite_db_path": "${env.SQLITE_DB_PATH:" + __distro_dir__ + "/" + db_name + "}",
        }
--- a/Show more
+++ b/Show more
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/bedrock/build.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/bedrock/run.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/cerebras/build.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/cerebras/run.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/fireworks/build.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/fireworks/run.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/meta-reference-gpu/build.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/ollama/build.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/ollama/run-with-safety.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/nvidia/build.yaml`