Merge branch 'main' into feat/litellm_sambanova_usage

2025-12-28 07:02:00 +00:00 · 2025-04-01 07:57:21 -05:00 · 2025-04-01 07:57:21 -05:00 · 9c9f9577e2
commit 9c9f9577e2
parent 8783dd8162 19f504e9e2
173 changed files with 3073 additions and 3118 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -818,14 +818,7 @@
            "delete": {
                "responses": {
                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/FileResponse"
-                                }
-                            }
-                        }
+                        "description": "OK"
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
@ -2122,7 +2115,7 @@
                        "content": {
                            "application/json": {
                                "schema": {
-                                    "$ref": "#/components/schemas/IterrowsResponse"
+                                    "$ref": "#/components/schemas/PaginatedResponse"
                                }
                            }
                        }
@ -2143,7 +2136,7 @@
                "tags": [
                    "DatasetIO"
                ],
-                "description": "Get a paginated list of rows from a dataset. Uses cursor-based pagination.",
+                "description": "Get a paginated list of rows from a dataset.\nUses offset-based pagination where:\n- start_index: The starting index (0-based). If None, starts from beginning.\n- limit: Number of items to return. If None or -1, returns all items.\n\nThe response includes:\n- data: List of items for the current page\n- has_more: Whether there are more items available after this set",
                "parameters": [
                    {
                        "name": "dataset_id",
@ -2695,9 +2688,9 @@
                    "200": {
                        "description": "OK",
                        "content": {
-                            "application/jsonl": {
+                            "application/json": {
                                "schema": {
-                                    "$ref": "#/components/schemas/ToolDef"
+                                    "$ref": "#/components/schemas/ListToolDefsResponse"
                                }
                            }
                        }
@ -4053,22 +4046,33 @@
                "type": "object",
                "properties": {
                    "strategy": {
-                        "$ref": "#/components/schemas/SamplingStrategy"
+                        "$ref": "#/components/schemas/SamplingStrategy",
+                        "description": "The sampling strategy."
                    },
                    "max_tokens": {
                        "type": "integer",
-                        "default": 0
+                        "default": 0,
+                        "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
                    },
                    "repetition_penalty": {
                        "type": "number",
-                        "default": 1.0
+                        "default": 1.0,
+                        "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics."
+                    },
+                    "stop": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        },
+                        "description": "Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."
                    }
                },
                "additionalProperties": false,
                "required": [
                    "strategy"
                ],
-                "title": "SamplingParams"
+                "title": "SamplingParams",
+                "description": "Sampling parameters."
            },
            "SamplingStrategy": {
                "oneOf": [
@ -6129,46 +6133,6 @@
                "title": "FileUploadResponse",
                "description": "Response after initiating a file upload session."
            },
-            "FileResponse": {
-                "type": "object",
-                "properties": {
-                    "bucket": {
-                        "type": "string",
-                        "description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)"
-                    },
-                    "key": {
-                        "type": "string",
-                        "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)"
-                    },
-                    "mime_type": {
-                        "type": "string",
-                        "description": "MIME type of the file"
-                    },
-                    "url": {
-                        "type": "string",
-                        "description": "Upload URL for the file contents"
-                    },
-                    "bytes": {
-                        "type": "integer",
-                        "description": "Size of the file in bytes"
-                    },
-                    "created_at": {
-                        "type": "integer",
-                        "description": "Timestamp of when the file was created"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "bucket",
-                    "key",
-                    "mime_type",
-                    "url",
-                    "bytes",
-                    "created_at"
-                ],
-                "title": "FileResponse",
-                "description": "Response representing a file entry."
-            },
            "EmbeddingsRequest": {
                "type": "object",
                "properties": {
@ -6922,6 +6886,46 @@
                "title": "URIDataSource",
                "description": "A dataset that can be obtained from a URI."
            },
+            "FileResponse": {
+                "type": "object",
+                "properties": {
+                    "bucket": {
+                        "type": "string",
+                        "description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)"
+                    },
+                    "key": {
+                        "type": "string",
+                        "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)"
+                    },
+                    "mime_type": {
+                        "type": "string",
+                        "description": "MIME type of the file"
+                    },
+                    "url": {
+                        "type": "string",
+                        "description": "Upload URL for the file contents"
+                    },
+                    "bytes": {
+                        "type": "integer",
+                        "description": "Size of the file in bytes"
+                    },
+                    "created_at": {
+                        "type": "integer",
+                        "description": "Timestamp of when the file was created"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "bucket",
+                    "key",
+                    "mime_type",
+                    "url",
+                    "bytes",
+                    "created_at"
+                ],
+                "title": "FileResponse",
+                "description": "Response representing a file entry."
+            },
            "Model": {
                "type": "object",
                "properties": {
@ -7660,7 +7664,8 @@
                            "completed",
                            "in_progress",
                            "failed",
-                            "scheduled"
+                            "scheduled",
+                            "cancelled"
                        ],
                        "title": "JobStatus"
                    },
@ -8068,7 +8073,7 @@
                "additionalProperties": false,
                "title": "ToolInvocationResult"
            },
-            "IterrowsResponse": {
+            "PaginatedResponse": {
                "type": "object",
                "properties": {
                    "data": {
@ -8098,19 +8103,20 @@
                                ]
                            }
                        },
-                        "description": "The rows in the current page."
+                        "description": "The list of items for the current page"
                    },
-                    "next_start_index": {
-                        "type": "integer",
-                        "description": "Index into dataset for the first row in the next page. None if there are no more rows."
+                    "has_more": {
+                        "type": "boolean",
+                        "description": "Whether there are more items available after this set"
                    }
                },
                "additionalProperties": false,
                "required": [
-                    "data"
+                    "data",
+                    "has_more"
                ],
-                "title": "IterrowsResponse",
-                "description": "A paginated list of rows from a dataset."
+                "title": "PaginatedResponse",
+                "description": "A generic paginated response that follows a simple format."
            },
            "Job": {
                "type": "object",
@ -8124,7 +8130,8 @@
                            "completed",
                            "in_progress",
                            "failed",
-                            "scheduled"
+                            "scheduled",
+                            "cancelled"
                        ],
                        "title": "JobStatus"
                    }
@ -8321,6 +8328,22 @@
                ],
                "title": "ListRoutesResponse"
            },
+            "ListToolDefsResponse": {
+                "type": "object",
+                "properties": {
+                    "data": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/ToolDef"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "data"
+                ],
+                "title": "ListToolDefsResponse"
+            },
            "ListScoringFunctionsResponse": {
                "type": "object",
                "properties": {
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -557,10 +557,6 @@ paths:
      responses:
        '200':
          description: OK
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/FileResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
@ -1447,7 +1443,7 @@ paths:
          content:
            application/json:
              schema:
-                $ref: '#/components/schemas/IterrowsResponse'
+                $ref: '#/components/schemas/PaginatedResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
@ -1461,7 +1457,20 @@ paths:
      tags:
        - DatasetIO
      description: >-
-        Get a paginated list of rows from a dataset. Uses cursor-based pagination.
+        Get a paginated list of rows from a dataset.
+
+        Uses offset-based pagination where:
+
+        - start_index: The starting index (0-based). If None, starts from beginning.
+
+        - limit: Number of items to return. If None or -1, returns all items.
+
+
+        The response includes:
+
+        - data: List of items for the current page
+
+        - has_more: Whether there are more items available after this set
      parameters:
        - name: dataset_id
          in: path
@ -1846,9 +1855,9 @@ paths:
        '200':
          description: OK
          content:
-            application/jsonl:
+            application/json:
              schema:
-                $ref: '#/components/schemas/ToolDef'
+                $ref: '#/components/schemas/ListToolDefsResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
@ -2787,16 +2796,33 @@ components:
      properties:
        strategy:
          $ref: '#/components/schemas/SamplingStrategy'
+          description: The sampling strategy.
        max_tokens:
          type: integer
          default: 0
+          description: >-
+            The maximum number of tokens that can be generated in the completion.
+            The token count of your prompt plus max_tokens cannot exceed the model's
+            context length.
        repetition_penalty:
          type: number
          default: 1.0
+          description: >-
+            Number between -2.0 and 2.0. Positive values penalize new tokens based
+            on whether they appear in the text so far, increasing the model's likelihood
+            to talk about new topics.
+        stop:
+          type: array
+          items:
+            type: string
+          description: >-
+            Up to 4 sequences where the API will stop generating further tokens. The
+            returned text will not contain the stop sequence.
      additionalProperties: false
      required:
        - strategy
      title: SamplingParams
+      description: Sampling parameters.
    SamplingStrategy:
      oneOf:
        - $ref: '#/components/schemas/GreedySamplingStrategy'
@ -4269,39 +4295,6 @@ components:
      title: FileUploadResponse
      description: >-
        Response after initiating a file upload session.
-    FileResponse:
-      type: object
-      properties:
-        bucket:
-          type: string
-          description: >-
-            Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)
-        key:
-          type: string
-          description: >-
-            Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
-        mime_type:
-          type: string
-          description: MIME type of the file
-        url:
-          type: string
-          description: Upload URL for the file contents
-        bytes:
-          type: integer
-          description: Size of the file in bytes
-        created_at:
-          type: integer
-          description: Timestamp of when the file was created
-      additionalProperties: false
-      required:
-        - bucket
-        - key
-        - mime_type
-        - url
-        - bytes
-        - created_at
-      title: FileResponse
-      description: Response representing a file entry.
    EmbeddingsRequest:
      type: object
      properties:
@ -4813,6 +4806,39 @@ components:
      title: URIDataSource
      description: >-
        A dataset that can be obtained from a URI.
+    FileResponse:
+      type: object
+      properties:
+        bucket:
+          type: string
+          description: >-
+            Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)
+        key:
+          type: string
+          description: >-
+            Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
+        mime_type:
+          type: string
+          description: MIME type of the file
+        url:
+          type: string
+          description: Upload URL for the file contents
+        bytes:
+          type: integer
+          description: Size of the file in bytes
+        created_at:
+          type: integer
+          description: Timestamp of when the file was created
+      additionalProperties: false
+      required:
+        - bucket
+        - key
+        - mime_type
+        - url
+        - bytes
+        - created_at
+      title: FileResponse
+      description: Response representing a file entry.
    Model:
      type: object
      properties:
@ -5289,6 +5315,7 @@ components:
            - in_progress
            - failed
            - scheduled
+            - cancelled
          title: JobStatus
        scheduled_at:
          type: string
@ -5528,7 +5555,7 @@ components:
              - type: object
      additionalProperties: false
      title: ToolInvocationResult
-    IterrowsResponse:
+    PaginatedResponse:
      type: object
      properties:
        data:
@ -5543,17 +5570,18 @@ components:
                - type: string
                - type: array
                - type: object
-          description: The rows in the current page.
-        next_start_index:
-          type: integer
+          description: The list of items for the current page
+        has_more:
+          type: boolean
          description: >-
-            Index into dataset for the first row in the next page. None if there are
-            no more rows.
+            Whether there are more items available after this set
      additionalProperties: false
      required:
        - data
-      title: IterrowsResponse
-      description: A paginated list of rows from a dataset.
+        - has_more
+      title: PaginatedResponse
+      description: >-
+        A generic paginated response that follows a simple format.
    Job:
      type: object
      properties:
@ -5566,6 +5594,7 @@ components:
            - in_progress
            - failed
            - scheduled
+            - cancelled
          title: JobStatus
      additionalProperties: false
      required:
@ -5703,6 +5732,17 @@ components:
      required:
        - data
      title: ListRoutesResponse
+    ListToolDefsResponse:
+      type: object
+      properties:
+        data:
+          type: array
+          items:
+            $ref: '#/components/schemas/ToolDef'
+      additionalProperties: false
+      required:
+        - data
+      title: ListToolDefsResponse
    ListScoringFunctionsResponse:
      type: object
      properties:
--- a/docs/_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
+++ b/docs/_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
--- a/docs/_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
+++ b/docs/_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
--- a/docs/_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
+++ b/docs/_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
@ -963,16 +963,19 @@
        "\n",
        "client.benchmarks.register(\n",
        "    benchmark_id=\"meta-reference::mmmu\",\n",
+        "    # Note: we can use any value as `dataset_id` because we'll be using the `evaluate_rows` API which accepts the \n",
+        "    # `input_rows` argument and does not fetch data from the dataset.\n",
        "    dataset_id=f\"mmmu-{subset}-{split}\",\n",
-        "    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
+        "    # Note: for the same reason as above, we can use any value as `scoring_functions`.\n",
+        "    scoring_functions=[],\n",
        ")\n",
        "\n",
-        "response = client.eval.evaluate_rows_alpha(\n",
+        "response = client.eval.evaluate_rows(\n",
        "    benchmark_id=\"meta-reference::mmmu\",\n",
        "    input_rows=eval_rows,\n",
+        "    # Note: Here we define the actual scoring functions.\n",
        "    scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
        "    benchmark_config={\n",
-        "        \"type\": \"benchmark\",\n",
        "        \"eval_candidate\": {\n",
        "            \"type\": \"model\",\n",
        "            \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@ -1139,12 +1142,11 @@
        "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
        ")\n",
        "\n",
-        "response = client.eval.evaluate_rows_alpha(\n",
+        "response = client.eval.evaluate_rows(\n",
        "    benchmark_id=\"meta-reference::simpleqa\",\n",
        "    input_rows=eval_rows.data,\n",
        "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
        "    benchmark_config={\n",
-        "        \"type\": \"benchmark\",\n",
        "        \"eval_candidate\": {\n",
        "            \"type\": \"model\",\n",
        "            \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@ -1288,12 +1290,11 @@
        "    \"enable_session_persistence\": False,\n",
        "}\n",
        "\n",
-        "response = client.eval.evaluate_rows_alpha(\n",
+        "response = client.eval.evaluate_rows(\n",
        "    benchmark_id=\"meta-reference::simpleqa\",\n",
        "    input_rows=eval_rows.data,\n",
        "    scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
        "    benchmark_config={\n",
-        "        \"type\": \"benchmark\",\n",
        "        \"eval_candidate\": {\n",
        "            \"type\": \"agent\",\n",
        "            \"config\": agent_config,\n",
--- a/docs/openapi_generator/generate.py
+++ b/docs/openapi_generator/generate.py
@ -21,7 +21,7 @@ from llama_stack.distribution.stack import LlamaStack  # noqa: E402

 from .pyopenapi.options import Options  # noqa: E402
 from .pyopenapi.specification import Info, Server  # noqa: E402
-from .pyopenapi.utility import Specification, validate_api_method_return_types  # noqa: E402
+from .pyopenapi.utility import Specification, validate_api  # noqa: E402


 def str_presenter(dumper, data):
@ -40,8 +40,7 @@ def main(output_dir: str):
        raise ValueError(f"Directory {output_dir} does not exist")

    # Validate API protocols before generating spec
-    print("Validating API method return types...")
-    return_type_errors = validate_api_method_return_types()
+    return_type_errors = validate_api()
    if return_type_errors:
        print("\nAPI Method Return Type Validation Errors:\n")
        for error in return_type_errors:
--- a/docs/openapi_generator/pyopenapi/utility.py
+++ b/docs/openapi_generator/pyopenapi/utility.py
@ -7,10 +7,9 @@
 import json
 import typing
 import inspect
-import os
 from pathlib import Path
 from typing import TextIO
-from typing import Any, Dict, List, Optional, Protocol, Type, Union, get_type_hints, get_origin, get_args
+from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args

 from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
 from llama_stack.distribution.resolver import api_protocol_map
@ -125,29 +124,89 @@ def is_optional_type(type_: Any) -> bool:
    return origin is Optional or (origin is Union and type(None) in args)


-def validate_api_method_return_types() -> List[str]:
-    """Validate that all API methods have proper return types."""
+def _validate_api_method_return_type(method) -> str | None:
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if is_optional_type(return_type):
+        return "returns Optional type where a return value is mandatory"
+
+
+def _validate_api_method_doesnt_return_list(method) -> str | None:
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if get_origin(return_type) is list:
+        return "returns a list where a PaginatedResponse or List*Response object is expected"
+
+
+def _validate_api_delete_method_returns_none(method) -> str | None:
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if return_type is not None and return_type is not type(None):
+        return "does not return None where None is mandatory"
+
+
+def _validate_list_parameters_contain_data(method) -> str | None:
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if not inspect.isclass(return_type):
+        return
+
+    if not return_type.__name__.startswith('List'):
+        return
+
+    if 'data' not in return_type.model_fields:
+        return "does not have a mandatory data attribute containing the list of objects"
+
+
+_VALIDATORS = {
+    "GET": [
+        _validate_api_method_return_type,
+        _validate_list_parameters_contain_data,
+        _validate_api_method_doesnt_return_list,
+    ],
+    "DELETE": [
+        _validate_api_delete_method_returns_none,
+    ],
+}
+
+
+def _get_methods_by_type(protocol, method_type: str):
+    members = inspect.getmembers(protocol, predicate=inspect.isfunction)
+    return {
+        method_name: method
+        for method_name, method in members
+        if (webmethod := getattr(method, '__webmethod__', None))
+        if webmethod and webmethod.method == method_type
+    }
+
+
+def validate_api() -> List[str]:
+    """Validate the API protocols."""
    errors = []
    protocols = api_protocol_map()

-    for protocol_name, protocol in protocols.items():
-        methods = inspect.getmembers(protocol, predicate=inspect.isfunction)
-
-        for method_name, method in methods:
-            if not hasattr(method, '__webmethod__'):
-                continue
-
-            # Only check GET methods
-            if method.__webmethod__.method != "GET":
-                continue
-
-            hints = get_type_hints(method)
-
-            if 'return' not in hints:
-                errors.append(f"Method {protocol_name}.{method_name} has no return type annotation")
-            else:
-                return_type = hints['return']
-                if is_optional_type(return_type):
-                    errors.append(f"Method {protocol_name}.{method_name} returns Optional type")
+    for target, validators in _VALIDATORS.items():
+        for protocol_name, protocol in protocols.items():
+            for validator in validators:
+                for method_name, method in _get_methods_by_type(protocol, target).items():
+                    err = validator(method)
+                    if err:
+                        errors.append(f"Method {protocol_name}.{method_name} {err}")

    return errors
--- a/docs/source/building_applications/index.md
+++ b/docs/source/building_applications/index.md
@ -1,4 +1,4 @@
-# Building AI Applications
+# Building AI Applications (Examples)

 Llama Stack provides all the building blocks needed to create sophisticated AI applications.

--- a/docs/source/building_applications/rag.md
+++ b/docs/source/building_applications/rag.md
@ -1,4 +1,4 @@
-## Using Retrieval Augmented Generation (RAG)
+## Retrieval Augmented Generation (RAG)

 RAG enables your applications to reference and recall information from previous interactions or external documents.

--- a/docs/source/building_applications/telemetry.md
+++ b/docs/source/building_applications/telemetry.md
@ -45,14 +45,16 @@ Here's an example that sends telemetry signals to all three sink types. Your con
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
-      sinks: ['console', 'sqlite', 'otel']
-      otel_endpoint: "http://localhost:4318/v1/traces"
+      sinks: ['console', 'sqlite', 'otel_trace', 'otel_metric']
+      otel_trace_endpoint: "http://localhost:4318/v1/traces"
+      otel_metric_endpoint: "http://localhost:4318/v1/metrics"
      sqlite_db_path: "/path/to/telemetry.db"
 ```

 ### Jaeger to visualize traces

-The `otel` sink works with any service compatible with the OpenTelemetry collector. Let's use Jaeger to visualize this data.
+The `otel_trace` and `otel_metric` sinks work with any service compatible with the OpenTelemetry collector; traces and metrics have two separate endpoints.
+Let's use Jaeger to visualize this data.

 Start a Jaeger instance with the OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686 using the following command:

--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -16,6 +16,7 @@ from docutils import nodes
 from pathlib import Path
 import requests
 import json
+from datetime import datetime

 # Read version from pyproject.toml
 with Path(__file__).parent.parent.parent.joinpath("pyproject.toml").open("rb") as f:
@ -28,7 +29,7 @@ with Path(__file__).parent.parent.parent.joinpath("pyproject.toml").open("rb") a
    llama_stack_version_link = f"<a href='{llama_stack_version_url}'>release notes</a>"

 project = "llama-stack"
-copyright = "2025, Meta"
+copyright = f"{datetime.now().year}, Meta"
 author = "Meta"

 # -- General configuration ---------------------------------------------------
@ -37,6 +38,7 @@ author = "Meta"
 extensions = [
    "myst_parser",
    "sphinx_rtd_theme",
+    "sphinx_rtd_dark_mode",
    "sphinx_copybutton",
    "sphinx_tabs.tabs",
    "sphinx_design",
@ -103,6 +105,8 @@ source_suffix = {
 # html_theme = "alabaster"
 html_theme_options = {
    "canonical_url": "https://github.com/meta-llama/llama-stack",
+    'collapse_navigation': False,
+
    # "style_nav_header_background": "#c3c9d4",
 }

--- a/docs/source/contributing/index.md
+++ b/docs/source/contributing/index.md
@ -1,14 +1,14 @@
-# Contributing to Llama Stack

-Start with the [Contributing Guide](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md) for some general tips. This section covers a few key topics in more detail.
+```{include} ../../../CONTRIBUTING.md
+```
+
+See [Adding a New API Provider](new_api_provider.md), which describes how to add new API providers to the Stack.
+

- [Adding a New API Provider](new_api_provider.md) describes adding new API providers to the Stack.
- [Testing Llama Stack](testing.md) provides details about the testing framework and how to test providers and distributions.

 ```{toctree}
 :maxdepth: 1
 :hidden:

 new_api_provider
-testing
 ```
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@ -67,7 +67,7 @@ options:
                        Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config. (default:
                        conda)
  --image-name IMAGE_NAME
-                        [for image-type=conda|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active Conda environment will be used if
+                        [for image-type=conda|container|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active Conda environment will be used if
                        found. (default: None)
  --print-deps-only     Print the dependencies for the stack only, without building the stack (default: False)
  --run                 Run the stack after building using the same image type, name, and other applicable arguments (default: False)
--- a/docs/source/distributions/configuration.md
+++ b/docs/source/distributions/configuration.md
@ -1,4 +1,4 @@
-# Configuring a Stack
+# Configuring a "Stack"

 The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution:

--- a/docs/source/distributions/importing_as_library.md
+++ b/docs/source/distributions/importing_as_library.md
@ -1,10 +1,12 @@
 # Using Llama Stack as a Library

-If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library. This avoids the overhead of setting up a server.
+## Setup Llama Stack without a Server
+If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library.
+This avoids the overhead of setting up a server.
 ```bash
 # setup
 uv pip install llama-stack
-llama stack build --template together --image-type venv
+llama stack build --template ollama --image-type venv
 ```

 ```python
--- a/docs/source/distributions/index.md
+++ b/docs/source/distributions/index.md
@ -1,34 +1,18 @@
-# Starting a Llama Stack Server
+# Distributions Overview

-You can run a Llama Stack server in one of the following ways:
-
-**As a Library**:
-
-This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and relying on an external inference service (eg. fireworks, together, groq, etc.) See [Using Llama Stack as a Library](importing_as_library)
-
-
-**Container**:
-
-Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
-
-
-**Conda**:
-
-If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
-
-
-**Kubernetes**:
-
-If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally. See [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
+A distribution is a pre-packaged set of Llama Stack components that can be deployed together.

+This section provides an overview of the distributions available in Llama Stack.

 ```{toctree}
-:maxdepth: 1
-:hidden:
+:maxdepth: 3

 importing_as_library
-building_distro
 configuration
-selection
+list_of_distributions
 kubernetes_deployment
+building_distro
+on_device_distro
+remote_hosted_distro
+self_hosted_distro
 ```
--- a/docs/source/distributions/kubernetes_deployment.md
+++ b/docs/source/distributions/kubernetes_deployment.md
@ -1,6 +1,9 @@
 # Kubernetes Deployment Guide

-Instead of starting the Llama Stack and vLLM servers locally. We can deploy them in a Kubernetes cluster. In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
+Instead of starting the Llama Stack and vLLM servers locally, we can deploy them in a Kubernetes cluster.
+
+### Prerequisites
+In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.

 First, create a local Kubernetes cluster via Kind:

@ -8,7 +11,7 @@ First, create a local Kubernetes cluster via Kind:
 kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test
 ```

-Start vLLM server as a Kubernetes Pod and Service:
+Then, create a Kubernetes PVC and Secret for downloading and storing the Hugging Face model:

 ```bash
 cat <<EOF |kubectl apply -f -
@ -31,7 +34,13 @@ metadata:
 type: Opaque
 data:
  token: $(HF_TOKEN)
---
+```
+
+
+Next, start the vLLM server as a Kubernetes Deployment and Service:
+
+```bash
+cat <<EOF |kubectl apply -f -
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@ -47,28 +56,23 @@ spec:
        app.kubernetes.io/name: vllm
    spec:
      containers:
-      - name: llama-stack
-        image: $(VLLM_IMAGE)
-        command:
-            - bash
-            - -c
-            - |
-              MODEL="meta-llama/Llama-3.2-1B-Instruct"
-              MODEL_PATH=/app/model/$(basename $MODEL)
-              huggingface-cli login --token $HUGGING_FACE_HUB_TOKEN
-              huggingface-cli download $MODEL --local-dir $MODEL_PATH --cache-dir $MODEL_PATH
-              python3 -m vllm.entrypoints.openai.api_server --model $MODEL_PATH --served-model-name $MODEL --port 8000
+      - name: vllm
+        image: vllm/vllm-openai:latest
+        command: ["/bin/sh", "-c"]
+        args: [
+          "vllm serve meta-llama/Llama-3.2-1B-Instruct"
+        ]
+        env:
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token-secret
+              key: token
        ports:
          - containerPort: 8000
        volumeMounts:
          - name: llama-storage
-            mountPath: /app/model
-        env:
-          - name: HUGGING_FACE_HUB_TOKEN
-            valueFrom:
-              secretKeyRef:
-                name: hf-token-secret
-                key: token
+            mountPath: /root/.cache/huggingface
      volumes:
      - name: llama-storage
        persistentVolumeClaim:
@ -127,6 +131,7 @@ EOF
 podman build -f /tmp/test-vllm-llama-stack/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s /tmp/test-vllm-llama-stack
 ```

+### Deploying Llama Stack Server in Kubernetes

 We can then start the Llama Stack server by deploying a Kubernetes Pod and Service:

@ -187,6 +192,7 @@ spec:
 EOF
 ```

+### Verifying the Deployment
 We can check that the LlamaStack server has started:

 ```bash
--- a/docs/source/distributions/list_of_distributions.md
+++ b/docs/source/distributions/list_of_distributions.md
@ -1,4 +1,4 @@
-# List of Distributions
+# List of Available Distributions

 Here are a list of distributions you can use to start a Llama Stack server that are provided out of the box.

--- a/docs/source/distributions/remote_hosted_distro/nvidia.md
+++ b/docs/source/distributions/remote_hosted_distro/nvidia.md
@ -9,6 +9,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 | datasetio | `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::nvidia` |
+| post_training | `remote::nvidia` |
 | safety | `remote::nvidia` |
 | scoring | `inline::basic` |
 | telemetry | `inline::meta-reference` |
@ -21,6 +22,12 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 The following environment variables can be configured:

 - `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
+- `NVIDIA_USER_ID`: NVIDIA User ID (default: `llama-stack-user`)
+- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
+- `NVIDIA_ACCESS_POLICIES`: NVIDIA Access Policies (default: `{}`)
+- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
+- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
+- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
 - `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
 - `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
 - `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
--- a/docs/source/distributions/self_hosted_distro/remote-vllm.md
+++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md
@ -98,11 +98,14 @@ export INFERENCE_PORT=8000
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 export LLAMA_STACK_PORT=8321

+# You need a local checkout of llama-stack to run this, get it using
+# git clone https://github.com/meta-llama/llama-stack.git
+cd /path/to/llama-stack
+
 docker run \
-  -it \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
+  -v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \
  llamastack/distribution-remote-vllm \
  --yaml-config /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
@ -121,7 +124,6 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 cd /path/to/llama-stack

 docker run \
-  -it \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
--- a/docs/source/distributions/starting_llama_stack_server.md
+++ b/docs/source/distributions/starting_llama_stack_server.md
@ -0,0 +1,32 @@
+# Starting a Llama Stack Server
+
+You can run a Llama Stack server in one of the following ways:
+
+**As a Library**:
+
+This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and are relying on an external inference service (e.g., fireworks, together, groq, etc.). See [Using Llama Stack as a Library](importing_as_library).
+
+
+**Container**:
+
+Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
+
+
+**Conda**:
+
+If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
+
+
+**Kubernetes**:
+
+If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally, see the [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
+
+
+```{toctree}
+:maxdepth: 1
+:hidden:
+
+importing_as_library
+configuration
+kubernetes_deployment
+```
--- a/docs/source/getting_started/index.md
+++ b/docs/source/getting_started/index.md
@ -1,10 +1,11 @@
 # Quick Start

-In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple RAG agent.
+In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to build a simple [RAG (Retrieval Augmented Generation)](../building_applications/rag.md) agent.

 A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with tools (e.g., RAG, web search, code execution, etc.) for taking actions.

 In Llama Stack, we provide a server exposing multiple APIs. These APIs are backed by implementations from different providers. For this guide, we will use [Ollama](https://ollama.com/) as the inference provider.
+Ollama is an LLM runtime that allows you to run Llama models locally.


 ### 1. Start Ollama
@ -24,7 +25,7 @@ If you do not have ollama, you can install it from [here](https://ollama.com/dow

 ### 2. Pick a client environment

-Llama Stack has a service-oriented architecture, so every interaction with the Stack happens through an REST interface. You can interact with the Stack in two ways:
+Llama Stack has a service-oriented architecture, so every interaction with the Stack happens through a REST interface. You can interact with the Stack in two ways:

 * Install the `llama-stack-client` PyPI package and point `LlamaStackClient` to a local or remote Llama Stack server.
 * Or, install the `llama-stack` PyPI package and use the Stack as a library using `LlamaStackAsLibraryClient`.
--- a/docs/source/index.md
+++ b/docs/source/index.md
@ -6,6 +6,7 @@ Llama Stack {{ llama_stack_version }} is now available! See the {{ llama_stack_v

 # Llama Stack

+## What is Llama Stack?

 Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. It provides a unified set of APIs with implementations from leading service providers, enabling seamless transitions between development and production environments. More specifically, it provides

@ -22,6 +23,12 @@ Llama Stack defines and standardizes the core building blocks needed to bring ge

 Our goal is to provide pre-packaged implementations (aka "distributions") which can be run in a variety of deployment environments. LlamaStack can assist you in your entire app development lifecycle - start iterating on local, mobile or desktop and seamlessly transition to on-prem or public cloud deployments. At every point in this transition, the same set of APIs and the same developer experience is available.

+## How does Llama Stack work?
+Llama Stack consists of a [server](./distributions/index.md) (with multiple pluggable API [providers](./providers/index.md)) and [client SDKs](#available-sdks) meant to
+be used in your applications. The server can be run in a variety of environments, including local (inline)
+development, on-premises, and cloud. The client SDKs are available for Python, Swift, Node, and
+Kotlin.
+
 ## Quick Links

 - New to Llama Stack? Start with the [Introduction](introduction/index) to understand our motivation and vision.
@ -93,7 +100,6 @@ getting_started/index
 concepts/index
 providers/index
 distributions/index
-distributions/selection
 building_applications/index
 playground/index
 contributing/index
--- a/docs/source/playground/index.md
+++ b/docs/source/playground/index.md
@ -92,8 +92,6 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie

 ## Starting the Llama Stack Playground

-### Llama CLI
-
 To start the Llama Stack Playground, run the following commands:

 1. Start up the Llama Stack API server
@ -109,29 +107,3 @@ cd llama_stack/distribution/ui
 pip install -r requirements.txt
 streamlit run app.py
 ```
-
-### Docker
-
-Playground can also be started in a docker image:
-
-```sh
-export LLAMA_STACK_URL=http://localhost:11434
-
-docker run \
-  --pull always \
-  -p 8501:8501 \
-  -e LLAMA_STACK_ENDPOINT=$LLAMA_STACK_URL \
-  quay.io/jland/llama-stack-playground
-```
-
-## Configurable Environment Variables
-
-## Environment Variables
-
-| Environment Variable       | Description                        | Default Value             |
-|----------------------------|------------------------------------|---------------------------|
-| LLAMA_STACK_ENDPOINT       | The endpoint for the Llama Stack   | http://localhost:8321     |
-| FIREWORKS_API_KEY          | API key for Fireworks provider     | (empty string)            |
-| TOGETHER_API_KEY           | API key for Together provider      | (empty string)            |
-| SAMBANOVA_API_KEY          | API key for SambaNova provider     | (empty string)            |
-| OPENAI_API_KEY             | API key for OpenAI provider        | (empty string)            |
--- a/docs/source/providers/vector_io/sqlite-vec.md
+++ b/docs/source/providers/vector_io/sqlite-vec.md
@ -10,11 +10,57 @@ That means you're not limited to storing vectors in memory or in a separate serv
 ## Features

 - Lightweight and easy to use
- Fully integrated with Llama Stack
+- Fully integrated with Llama Stack
+- Uses disk-based storage for persistence, allowing for larger vector storage
+
+### Comparison to Faiss
+
+The choice between Faiss and sqlite-vec should be made based on the needs of your application,
+as they have different strengths.
+
+#### Choosing the Right Provider
+
+Scenario | Recommended Tool | Reason
+-- |-----------------| --
+Online Analytical Processing (OLAP) | Faiss           | Fast, in-memory searches
+Online Transaction Processing (OLTP) | sqlite-vec      | Frequent writes and reads
+Frequent writes | sqlite-vec      | Efficient disk-based storage and incremental indexing
+Large datasets | sqlite-vec      | Disk-based storage for larger vector storage
+Datasets that can fit in memory, frequent reads | Faiss | Optimized for speed, indexing, and GPU acceleration
+
+#### Empirical Example
+
+Consider the histogram below in which 10,000 randomly generated strings were inserted
+in batches of 100 into both Faiss and sqlite-vec using `client.tool_runtime.rag_tool.insert()`.
+
+```{image} ../../../../_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
+:alt: Comparison of SQLite-Vec and Faiss write times
+:width: 400px
+```
+
+You will notice that the average write time for `sqlite-vec` was 788ms, compared to
+47,640ms for Faiss. While the number is jarring, if you look at the distribution, you can see that it is rather
+uniformly spread across the [1500, 100000] interval.
+
+Looking at each individual write in the order that the documents are inserted, you'll see the increase in
+write time as Faiss reindexes the vectors after each write.
+```{image} ../../../../_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
+:alt: Comparison of SQLite-Vec and Faiss write times
+:width: 400px
+```
+
+In comparison, the read times for Faiss were on average 10% faster than those for sqlite-vec.
+The modes of the two distributions highlight the differences much further where Faiss
+will likely yield faster read performance.
+
+```{image} ../../../../_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
+:alt: Comparison of SQLite-Vec and Faiss read times
+:width: 400px
+```

 ## Usage

-To use SQLite-Vec in your Llama Stack project, follow these steps:
+To use sqlite-vec in your Llama Stack project, follow these steps:

 1. Install the necessary dependencies.
 2. Configure your Llama Stack project to use SQLite-Vec.