diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 54c01c80d..5884f2582 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,4 +2,4 @@ # These owners will be the default owners for everything in # the repo. Unless a later match takes precedence, -* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan @SLR722 @leseb +* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning diff --git a/.github/TRIAGERS.md b/.github/TRIAGERS.md index d4ef6d1ac..586a5a506 100644 --- a/.github/TRIAGERS.md +++ b/.github/TRIAGERS.md @@ -1,2 +1,2 @@ # This file documents Triage members in the Llama Stack community -@franciscojavierarceo @leseb + @bbrowning @booxter @franciscojavierarceo @leseb diff --git a/.github/workflows/integration-auth-tests.yml b/.github/workflows/integration-auth-tests.yml index 3c155195f..82a76ad32 100644 --- a/.github/workflows/integration-auth-tests.yml +++ b/.github/workflows/integration-auth-tests.yml @@ -28,12 +28,13 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Install uv - uses: astral-sh/setup-uv@v5 + uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 with: python-version: "3.10" + activate-environment: true - name: Set Up Environment and Install Dependencies run: | @@ -43,7 +44,7 @@ jobs: - name: Install minikube if: ${{ matrix.auth-provider == 'kubernetes' }} - uses: medyagh/setup-minikube@latest + uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19 - name: Start minikube if: ${{ matrix.auth-provider == 'kubernetes' }} @@ -74,32 +75,8 @@ jobs: cat <<'EOF' > $run_dir/run.yaml version: '2' image_name: kube - apis: - - agents - - datasetio - - eval - - inference - - safety - - scoring - - telemetry - - tool_runtime - - vector_io - providers: - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/agents_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:\u200B}" - sinks: ${env.TELEMETRY_SINKS:console,sqlite} - sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ollama/trace_store.db} + apis: [] + providers: {} server: port: 8321 EOF diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index f82a7cdd2..da41e2185 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -33,7 +33,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Install uv - uses: astral-sh/setup-uv@c7f87aa956e4c323abf06d5dec078e358f6b4d04 # v6.0.0 + uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 with: python-version: "3.10" activate-environment: true @@ -58,7 +58,7 @@ jobs: INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" run: | source .venv/bin/activate - nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv > server.log 2>&1 & + LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv & - name: Wait for Llama Stack server to be ready if: matrix.client-type == 'http' @@ -85,6 +85,11 @@ jobs: echo "Ollama health check failed" exit 1 fi + - name: Check 
Storage and Memory Available Before Tests + if: ${{ always() }} + run: | + free -h + df -h - name: Run Integration Tests env: @@ -100,13 +105,20 @@ jobs: --text-model="meta-llama/Llama-3.2-3B-Instruct" \ --embedding-model=all-MiniLM-L6-v2 + - name: Check Storage and Memory Available After Tests + if: ${{ always() }} + run: | + free -h + df -h + - name: Write ollama logs to file + if: ${{ always() }} run: | sudo journalctl -u ollama.service > ollama.log - name: Upload all logs to artifacts - if: always() - uses: actions/upload-artifact@v4 + if: ${{ always() }} + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }} path: | diff --git a/.github/workflows/providers-build.yml b/.github/workflows/providers-build.yml index 7c136907c..3c1682833 100644 --- a/.github/workflows/providers-build.yml +++ b/.github/workflows/providers-build.yml @@ -56,7 +56,7 @@ jobs: python-version: '3.10' - name: Install uv - uses: astral-sh/setup-uv@c7f87aa956e4c323abf06d5dec078e358f6b4d04 # v6.0.0 + uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 with: python-version: "3.10" @@ -94,7 +94,7 @@ jobs: python-version: '3.10' - name: Install uv - uses: astral-sh/setup-uv@c7f87aa956e4c323abf06d5dec078e358f6b4d04 # v6.0.0 + uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 with: python-version: "3.10" @@ -120,7 +120,7 @@ jobs: python-version: '3.10' - name: Install uv - uses: astral-sh/setup-uv@c7f87aa956e4c323abf06d5dec078e358f6b4d04 # v6.0.0 + uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 with: python-version: "3.10" @@ -132,9 +132,9 @@ jobs: - name: Build a single provider run: | - yq -i '.image_type = "container"' llama_stack/templates/dev/build.yaml - yq -i '.image_name = "test"' llama_stack/templates/dev/build.yaml - USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/dev/build.yaml + yq -i '.image_type = "container"' llama_stack/templates/starter/build.yaml + yq -i '.image_name = "test"' llama_stack/templates/starter/build.yaml + USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/starter/build.yaml - name: Inspect the container image entrypoint run: | @@ -158,7 +158,7 @@ jobs: python-version: '3.10' - name: Install uv - uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1 + uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 with: python-version: "3.10" @@ -174,14 +174,14 @@ jobs: .image_type = "container" | .image_name = "ubi9-test" | .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest" - ' llama_stack/templates/dev/build.yaml + ' llama_stack/templates/starter/build.yaml - name: Build dev container (UBI9) env: USE_COPY_NOT_MOUNT: "true" LLAMA_STACK_DIR: "." run: | - uv run llama stack build --config llama_stack/templates/dev/build.yaml + uv run llama stack build --config llama_stack/templates/starter/build.yaml - name: Inspect UBI9 image run: | diff --git a/.github/workflows/test-external-providers.yml b/.github/workflows/test-external-providers.yml index b5b9645c1..2e18fc5eb 100644 --- a/.github/workflows/test-external-providers.yml +++ b/.github/workflows/test-external-providers.yml @@ -23,10 +23,10 @@ jobs: # container and point 'uv pip install' to the correct path... 
steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Install uv - uses: astral-sh/setup-uv@v6 + uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 with: python-version: "3.10" @@ -47,8 +47,8 @@ jobs: - name: Create provider configuration run: | - mkdir -p /tmp/providers.d/remote/inference - cp tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml /tmp/providers.d/remote/inference/custom_ollama.yaml + mkdir -p /home/runner/.llama/providers.d/remote/inference + cp tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml - name: Build distro from config file run: | @@ -66,7 +66,7 @@ jobs: - name: Wait for Llama Stack server to be ready run: | for i in {1..30}; do - if ! grep -q "remote::custom_ollama from /tmp/providers.d/remote/inference/custom_ollama.yaml" server.log; then + if ! grep -q "remote::custom_ollama from /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml" server.log; then echo "Waiting for Llama Stack server to load the provider..." sleep 1 else @@ -75,4 +75,5 @@ jobs: fi done echo "Provider failed to load" + cat server.log exit 1 diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 64a5bba37..d2dd34e05 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -37,7 +37,7 @@ jobs: with: python-version: ${{ matrix.python }} - - uses: astral-sh/setup-uv@c7f87aa956e4c323abf06d5dec078e358f6b4d04 # v6.0.0 + - uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 with: python-version: ${{ matrix.python }} enable-cache: false diff --git a/.github/workflows/update-readthedocs.yml b/.github/workflows/update-readthedocs.yml index 21e3b633d..04e23bca9 100644 --- a/.github/workflows/update-readthedocs.yml +++ b/.github/workflows/update-readthedocs.yml @@ -14,6 +14,8 @@ on: - 'docs/**' - 'pyproject.toml' - '.github/workflows/update-readthedocs.yml' + tags: + - '*' pull_request: branches: - main @@ -41,7 +43,7 @@ jobs: python-version: '3.11' - name: Install the latest version of uv - uses: astral-sh/setup-uv@c7f87aa956e4c323abf06d5dec078e358f6b4d04 # v6.0.0 + uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1 - name: Sync with uv run: uv sync --extra docs @@ -61,7 +63,10 @@ jobs: response=$(curl -X POST \ -H "Content-Type: application/json" \ - -d "{\"token\": \"$TOKEN\"}" \ + -d "{ + \"token\": \"$TOKEN\", + \"version\": \"$GITHUB_REF_NAME\" + }" \ https://readthedocs.org/api/v2/webhook/llama-stack/289768/) echo "Response: $response" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 42228d828..e78fcd158 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -106,6 +106,14 @@ repos: pass_filenames: false require_serial: true files: ^llama_stack/apis/|^docs/openapi_generator/ + - id: check-workflows-use-hashes + name: Check GitHub Actions use SHA-pinned actions + entry: ./scripts/check-workflows-use-hashes.sh + language: system + pass_filenames: false + require_serial: true + always_run: true + files: ^\.github/workflows/.*\.ya?ml$ ci: autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks diff --git a/CHANGELOG.md b/CHANGELOG.md index c57f3e253..a0b008982 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,26 @@ # Changelog +# v0.2.5 +Published on: 2025-05-04T20:16:49Z + + + +--- + +# v0.2.4 
+Published on: 2025-04-29T17:26:01Z + +## Highlights + +* One-liner to install and run Llama Stack yay! by @reluctantfuturist in https://github.com/meta-llama/llama-stack/pull/1383 +* support for NVIDIA NeMo datastore by @raspawar in https://github.com/meta-llama/llama-stack/pull/1852 +* (yuge!) Kubernetes authentication by @leseb in https://github.com/meta-llama/llama-stack/pull/1778 +* (yuge!) OpenAI Responses API by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1989 +* add api.llama provider, llama-guard-4 model by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2058 + + +--- + # v0.2.3 Published on: 2025-04-25T22:46:21Z diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 17eed43c5..d7c3e3e2f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -110,25 +110,9 @@ uv run pre-commit run --all-files > [!CAUTION] > Before pushing your changes, make sure that the pre-commit hooks have passed successfully. -## Running unit tests +## Running tests -You can run the unit tests by running: - -```bash -source .venv/bin/activate -./scripts/unit-tests.sh -``` - -If you'd like to run for a non-default version of Python (currently 3.10), pass `PYTHON_VERSION` variable as follows: - -``` -source .venv/bin/activate -PYTHON_VERSION=3.13 ./scripts/unit-tests.sh -``` - -## Running integration tests - -You can run integration tests following the instructions [here](tests/integration/README.md). +You can find the Llama Stack testing documentation [here](tests/README.md). ## Adding a new dependency to the project @@ -153,6 +137,8 @@ uv sync justification for bypassing the check. * When using `# type: ignore` to suppress a mypy warning, include a comment explaining the justification for bypassing the check. +* Don't use unicode characters in the codebase. ASCII-only is preferred for compatibility or + readability reasons. ## Common Tasks diff --git a/README.md b/README.md index b2b2d12d9..5dfe3577a 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain) [![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain) -[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) +[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack) ### ✨🎉 Llama 4 Support 🎉✨ We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 2875f0f41..6378a5ced 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -62,11 +62,12 @@ "tags": [ "DatasetIO" ], - "description": "", + "description": "Append rows to a dataset.", "parameters": [ { "name": "dataset_id", "in": "path", + "description": "The ID of the dataset to append the rows to.", "required": true, "schema": { "type": "string" @@ -89,7 +90,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A BatchChatCompletionResponse with the full completions.", "content": { "application/json": { "schema": { @@ -114,7 +115,7 @@ "tags": [ "Inference" ], - "description": "", + "description": "Generate chat completions for a batch of messages using the specified model.", "parameters": [], "requestBody": { "content": { @@ -132,7 +133,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A BatchCompletionResponse with the full completions.", "content": { "application/json": { "schema": { @@ -157,7 +158,7 @@ "tags": [ "Inference" ], - "description": "", + "description": "Generate completions for a batch of content using the specified model.", "parameters": [], "requestBody": { "content": { @@ -193,7 +194,7 @@ "tags": [ "PostTraining (Coming Soon)" ], - "description": "", + "description": "Cancel a training job.", "parameters": [], "requestBody": { "content": { @@ -211,7 +212,7 @@ "post": { "responses": { "200": { - "description": "If stream=False, returns a ChatCompletionResponse with the full completion. If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk", + "description": "If stream=False, returns a ChatCompletionResponse with the full completion. If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.", "content": { "application/json": { "schema": { @@ -259,7 +260,7 @@ "post": { "responses": { "200": { - "description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk", + "description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.", "content": { "application/json": { "schema": { @@ -307,11 +308,11 @@ "get": { "responses": { "200": { - "description": "A ListAgentsResponse.", + "description": "A PaginatedResponse.", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListAgentsResponse" + "$ref": "#/components/schemas/PaginatedResponse" } } } @@ -333,7 +334,26 @@ "Agents" ], "description": "List all agents.", - "parameters": [] + "parameters": [ + { + "name": "start_index", + "in": "query", + "description": "The index to start the pagination from.", + "required": false, + "schema": { + "type": "integer" + } + }, + { + "name": "limit", + "in": "query", + "description": "The number of agents to return.", + "required": false, + "schema": { + "type": "integer" + } + } + ] }, "post": { "responses": { @@ -434,7 +454,7 @@ "post": { "responses": { "200": { - "description": "If stream=False, returns a Turn object. If stream=True, returns an SSE event stream of AgentTurnResponseStreamChunk", + "description": "If stream=False, returns a Turn object. 
If stream=True, returns an SSE event stream of AgentTurnResponseStreamChunk.", "content": { "application/json": { "schema": { @@ -501,7 +521,7 @@ "post": { "responses": { "200": { - "description": "Runtime representation of an annotated type.", + "description": "An OpenAIResponseObject.", "content": { "application/json": { "schema": { @@ -549,7 +569,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A ListBucketResponse.", "content": { "application/json": { "schema": { @@ -579,6 +599,7 @@ { "name": "bucket", "in": "query", + "description": "Bucket name (valid chars: a-zA-Z0-9_-).", "required": true, "schema": { "type": "string" @@ -589,7 +610,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A FileUploadResponse.", "content": { "application/json": { "schema": { @@ -691,7 +712,7 @@ "tags": [ "Agents" ], - "description": "Delete an agent by its ID.", + "description": "Delete an agent by its ID and its associated sessions and turns.", "parameters": [ { "name": "agent_id", @@ -709,7 +730,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Session.", "content": { "application/json": { "schema": { @@ -789,7 +810,7 @@ "tags": [ "Agents" ], - "description": "Delete an agent session by its ID.", + "description": "Delete an agent session by its ID and its associated turns.", "parameters": [ { "name": "session_id", @@ -816,7 +837,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A FileResponse.", "content": { "application/json": { "schema": { @@ -846,7 +867,7 @@ { "name": "bucket", "in": "path", - "description": "Bucket name (valid chars: a-zA-Z0-9_-)", + "description": "Bucket name (valid chars: a-zA-Z0-9_-).", "required": true, "schema": { "type": "string" @@ -855,7 +876,7 @@ { "name": "key", "in": "path", - "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)", + "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.).", "required": true, "schema": { "type": "string" @@ -889,7 +910,7 @@ { "name": "bucket", "in": "path", - "description": "Bucket name (valid chars: a-zA-Z0-9_-)", + "description": "Bucket name (valid chars: a-zA-Z0-9_-).", "required": true, "schema": { "type": "string" @@ -898,7 +919,7 @@ { "name": "key", "in": "path", - "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)", + "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.).", "required": true, "schema": { "type": "string" @@ -911,7 +932,7 @@ "post": { "responses": { "200": { - "description": "An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}", + "description": "An array of embeddings, one for each content. Each embedding is a list of floats. 
The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.", "content": { "application/json": { "schema": { @@ -954,7 +975,7 @@ "post": { "responses": { "200": { - "description": "EvaluateResponse object containing generations and scores", + "description": "EvaluateResponse object containing generations and scores.", "content": { "application/json": { "schema": { @@ -1138,7 +1159,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Benchmark.", "content": { "application/json": { "schema": { @@ -1163,11 +1184,55 @@ "tags": [ "Benchmarks" ], - "description": "", + "description": "Get a benchmark by its ID.", "parameters": [ { "name": "benchmark_id", "in": "path", + "description": "The ID of the benchmark to get.", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, + "/v1/openai/v1/chat/completions/{completion_id}": { + "get": { + "responses": { + "200": { + "description": "A OpenAICompletionWithInputMessages.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenAICompletionWithInputMessages" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Inference" + ], + "description": "Describe a chat completion by its ID.", + "parameters": [ + { + "name": "completion_id", + "in": "path", + "description": "ID of the chat completion.", "required": true, "schema": { "type": "string" @@ -1180,7 +1245,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Dataset.", "content": { "application/json": { "schema": { @@ -1205,11 +1270,12 @@ "tags": [ "Datasets" ], - "description": "", + "description": "Get a dataset by its ID.", "parameters": [ { "name": "dataset_id", "in": "path", + "description": "The ID of the dataset to get.", "required": true, "schema": { "type": "string" @@ -1238,11 +1304,12 @@ "tags": [ "Datasets" ], - "description": "", + "description": "Unregister a dataset by its ID.", "parameters": [ { "name": "dataset_id", "in": "path", + "description": "The ID of the dataset to unregister.", "required": true, "schema": { "type": "string" @@ -1255,7 +1322,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Model.", "content": { "application/json": { "schema": { @@ -1280,11 +1347,12 @@ "tags": [ "Models" ], - "description": "", + "description": "Get a model by its identifier.", "parameters": [ { "name": "model_id", "in": "path", + "description": "The identifier of the model to get.", "required": true, "schema": { "type": "string" @@ -1313,11 +1381,12 @@ "tags": [ "Models" ], - "description": "", + "description": "Unregister a model.", "parameters": [ { "name": "model_id", "in": "path", + "description": "The identifier of the model to unregister.", "required": true, "schema": { "type": "string" @@ -1373,7 +1442,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A ScoringFn.", "content": { "application/json": { "schema": { @@ -1398,11 +1467,12 @@ "tags": [ "ScoringFunctions" ], - "description": "", + "description": "Get a scoring function by its ID.", "parameters": [ { "name": "scoring_fn_id", "in": "path", + "description": "The ID of the scoring function to get.", "required": true, "schema": { "type": "string" @@ -1415,7 
+1485,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Shield.", "content": { "application/json": { "schema": { @@ -1440,11 +1510,12 @@ "tags": [ "Shields" ], - "description": "", + "description": "Get a shield by its identifier.", "parameters": [ { "name": "identifier", "in": "path", + "description": "The identifier of the shield to get.", "required": true, "schema": { "type": "string" @@ -1457,7 +1528,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Span.", "content": { "application/json": { "schema": { @@ -1482,11 +1553,12 @@ "tags": [ "Telemetry" ], - "description": "", + "description": "Get a span by its ID.", "parameters": [ { "name": "trace_id", "in": "path", + "description": "The ID of the trace to get the span from.", "required": true, "schema": { "type": "string" @@ -1495,6 +1567,7 @@ { "name": "span_id", "in": "path", + "description": "The ID of the span to get.", "required": true, "schema": { "type": "string" @@ -1507,7 +1580,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A QuerySpanTreeResponse.", "content": { "application/json": { "schema": { @@ -1532,11 +1605,12 @@ "tags": [ "Telemetry" ], - "description": "", + "description": "Get a span tree by its ID.", "parameters": [ { "name": "span_id", "in": "path", + "description": "The ID of the span to get the tree from.", "required": true, "schema": { "type": "string" @@ -1559,7 +1633,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Tool.", "content": { "application/json": { "schema": { @@ -1584,11 +1658,12 @@ "tags": [ "ToolGroups" ], - "description": "", + "description": "Get a tool by its name.", "parameters": [ { "name": "tool_name", "in": "path", + "description": "The name of the tool to get.", "required": true, "schema": { "type": "string" @@ -1601,7 +1676,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A ToolGroup.", "content": { "application/json": { "schema": { @@ -1626,11 +1701,12 @@ "tags": [ "ToolGroups" ], - "description": "", + "description": "Get a tool group by its ID.", "parameters": [ { "name": "toolgroup_id", "in": "path", + "description": "The ID of the tool group to get.", "required": true, "schema": { "type": "string" @@ -1659,11 +1735,12 @@ "tags": [ "ToolGroups" ], - "description": "Unregister a tool group", + "description": "Unregister a tool group.", "parameters": [ { "name": "toolgroup_id", "in": "path", + "description": "The ID of the tool group to unregister.", "required": true, "schema": { "type": "string" @@ -1676,7 +1753,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A Trace.", "content": { "application/json": { "schema": { @@ -1701,11 +1778,12 @@ "tags": [ "Telemetry" ], - "description": "", + "description": "Get a trace by its ID.", "parameters": [ { "name": "trace_id", "in": "path", + "description": "The ID of the trace to get.", "required": true, "schema": { "type": "string" @@ -1718,7 +1796,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A PostTrainingJobArtifactsResponse.", "content": { "application/json": { "schema": { @@ -1743,11 +1821,12 @@ "tags": [ "PostTraining (Coming Soon)" ], - "description": "", + "description": "Get the artifacts of a training job.", "parameters": [ { "name": "job_uuid", "in": "query", + "description": "The UUID of the job to get the artifacts of.", "required": true, "schema": { "type": "string" @@ -1760,7 +1839,7 @@ "get": { "responses": { "200": 
{ - "description": "OK", + "description": "A PostTrainingJobStatusResponse.", "content": { "application/json": { "schema": { @@ -1785,11 +1864,12 @@ "tags": [ "PostTraining (Coming Soon)" ], - "description": "", + "description": "Get the status of a training job.", "parameters": [ { "name": "job_uuid", "in": "query", + "description": "The UUID of the job to get the status of.", "required": true, "schema": { "type": "string" @@ -1802,7 +1882,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A ListPostTrainingJobsResponse.", "content": { "application/json": { "schema": { @@ -1827,7 +1907,7 @@ "tags": [ "PostTraining (Coming Soon)" ], - "description": "", + "description": "Get all training jobs.", "parameters": [] } }, @@ -1835,7 +1915,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A FileUploadResponse.", "content": { "application/json": { "schema": { @@ -1860,12 +1940,12 @@ "tags": [ "Files" ], - "description": "Returns information about an existsing upload session", + "description": "Returns information about an existsing upload session.", "parameters": [ { "name": "upload_id", "in": "path", - "description": "ID of the upload session", + "description": "ID of the upload session.", "required": true, "schema": { "type": "string" @@ -1876,7 +1956,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A FileResponse or None if the upload is not complete.", "content": { "application/json": { "schema": { @@ -1913,7 +1993,7 @@ { "name": "upload_id", "in": "path", - "description": "ID of the upload session", + "description": "ID of the upload session.", "required": true, "schema": { "type": "string" @@ -1937,7 +2017,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A VectorDB.", "content": { "application/json": { "schema": { @@ -1962,11 +2042,12 @@ "tags": [ "VectorDBs" ], - "description": "", + "description": "Get a vector database by its identifier.", "parameters": [ { "name": "vector_db_id", "in": "path", + "description": "The identifier of the vector database to get.", "required": true, "schema": { "type": "string" @@ -1995,11 +2076,12 @@ "tags": [ "VectorDBs" ], - "description": "", + "description": "Unregister a vector database.", "parameters": [ { "name": "vector_db_id", "in": "path", + "description": "The identifier of the vector database to unregister.", "required": true, "schema": { "type": "string" @@ -2012,7 +2094,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A HealthInfo.", "content": { "application/json": { "schema": { @@ -2037,7 +2119,7 @@ "tags": [ "Inspect" ], - "description": "", + "description": "Get the health of the service.", "parameters": [] } }, @@ -2099,7 +2181,7 @@ "tags": [ "VectorIO" ], - "description": "", + "description": "Insert chunks into a vector database.", "parameters": [], "requestBody": { "content": { @@ -2117,7 +2199,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A ProviderInfo object containing the provider's details.", "content": { "application/json": { "schema": { @@ -2142,11 +2224,12 @@ "tags": [ "Providers" ], - "description": "", + "description": "Get detailed information about a specific provider.", "parameters": [ { "name": "provider_id", "in": "path", + "description": "The ID of the provider to inspect.", "required": true, "schema": { "type": "string" @@ -2159,7 +2242,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A ToolInvocationResult.", 
"content": { "application/json": { "schema": { @@ -2184,7 +2267,7 @@ "tags": [ "ToolRuntime" ], - "description": "Run a tool with the given arguments", + "description": "Run a tool with the given arguments.", "parameters": [], "requestBody": { "content": { @@ -2202,7 +2285,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A PaginatedResponse.", "content": { "application/json": { "schema": { @@ -2227,7 +2310,7 @@ "tags": [ "DatasetIO" ], - "description": "Get a paginated list of rows from a dataset.\nUses offset-based pagination where:\n- start_index: The starting index (0-based). If None, starts from beginning.\n- limit: Number of items to return. If None or -1, returns all items.\n\nThe response includes:\n- data: List of items for the current page\n- has_more: Whether there are more items available after this set", + "description": "Get a paginated list of rows from a dataset.\nUses offset-based pagination where:\n- start_index: The starting index (0-based). If None, starts from beginning.\n- limit: Number of items to return. If None or -1, returns all items.\n\nThe response includes:\n- data: List of items for the current page.\n- has_more: Whether there are more items available after this set.", "parameters": [ { "name": "dataset_id", @@ -2263,7 +2346,7 @@ "get": { "responses": { "200": { - "description": "The status of the evaluationjob.", + "description": "The status of the evaluation job.", "content": { "application/json": { "schema": { @@ -2410,11 +2493,11 @@ "get": { "responses": { "200": { - "description": "A ListAgentSessionsResponse.", + "description": "A PaginatedResponse.", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ListAgentSessionsResponse" + "$ref": "#/components/schemas/PaginatedResponse" } } } @@ -2445,6 +2528,24 @@ "schema": { "type": "string" } + }, + { + "name": "start_index", + "in": "query", + "description": "The index to start the pagination from.", + "required": false, + "schema": { + "type": "integer" + } + }, + { + "name": "limit", + "in": "query", + "description": "The number of sessions to return.", + "required": false, + "schema": { + "type": "integer" + } } ] } @@ -2453,7 +2554,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A ListBenchmarksResponse.", "content": { "application/json": { "schema": { @@ -2478,7 +2579,7 @@ "tags": [ "Benchmarks" ], - "description": "", + "description": "List all benchmarks.", "parameters": [] }, "post": { @@ -2502,7 +2603,7 @@ "tags": [ "Benchmarks" ], - "description": "", + "description": "Register a benchmark.", "parameters": [], "requestBody": { "content": { @@ -2516,678 +2617,79 @@ } } }, - "/v1/datasets": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListDatasetsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Datasets" - ], - "description": "", - "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Dataset" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": 
"#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Datasets" - ], - "description": "Register a new dataset.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterDatasetRequest" - } - } - }, - "required": true - } - } - }, - "/v1/files/{bucket}": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListFileResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Files" - ], - "description": "List all files in a bucket.", - "parameters": [ - { - "name": "bucket", - "in": "path", - "description": "Bucket name (valid chars: a-zA-Z0-9_-)", - "required": true, - "schema": { - "type": "string" - } - } - ] - } - }, - "/v1/models": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListModelsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Models" - ], - "description": "", - "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Model" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Models" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterModelRequest" - } - } - }, - "required": true - } - } - }, - "/v1/providers": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListProvidersResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Providers" - ], - "description": "", - "parameters": [] - } - }, - "/v1/inspect/routes": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListRoutesResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Inspect" - ], 
- "description": "", - "parameters": [] - } - }, - "/v1/tool-runtime/list-tools": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListToolDefsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ToolRuntime" - ], - "description": "", - "parameters": [ - { - "name": "tool_group_id", - "in": "query", - "required": false, - "schema": { - "type": "string" - } - }, - { - "name": "mcp_endpoint", - "in": "query", - "required": false, - "schema": { - "$ref": "#/components/schemas/URL" - } - } - ] - } - }, - "/v1/scoring-functions": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListScoringFunctionsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ScoringFunctions" - ], - "description": "", - "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK" - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ScoringFunctions" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterScoringFunctionRequest" - } - } - }, - "required": true - } - } - }, - "/v1/shields": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListShieldsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Shields" - ], - "description": "", - "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Shield" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Shields" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterShieldRequest" - } - } - }, - "required": true - } - } - }, - "/v1/toolgroups": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListToolGroupsResponse" - } - } - } - }, - "400": { - "$ref": 
"#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ToolGroups" - ], - "description": "List tool groups with optional provider", - "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK" - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ToolGroups" - ], - "description": "Register a tool group", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterToolGroupRequest" - } - } - }, - "required": true - } - } - }, - "/v1/tools": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListToolsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ToolGroups" - ], - "description": "List tools with optional tool group", - "parameters": [ - { - "name": "toolgroup_id", - "in": "query", - "required": false, - "schema": { - "type": "string" - } - } - ] - } - }, - "/v1/vector-dbs": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListVectorDBsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "VectorDBs" - ], - "description": "", - "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/VectorDB" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "VectorDBs" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterVectorDbRequest" - } - } - }, - "required": true - } - } - }, - "/v1/telemetry/events": { - "post": { - "responses": { - "200": { - "description": "OK" - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Telemetry" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/LogEventRequest" - } - } - }, - 
"required": true - } - } - }, "/v1/openai/v1/chat/completions": { + "get": { + "responses": { + "200": { + "description": "A ListOpenAIChatCompletionResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListOpenAIChatCompletionResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Inference" + ], + "description": "List all chat completions.", + "parameters": [ + { + "name": "after", + "in": "query", + "description": "The ID of the last chat completion to return.", + "required": false, + "schema": { + "type": "string" + } + }, + { + "name": "limit", + "in": "query", + "description": "The maximum number of chat completions to return.", + "required": false, + "schema": { + "type": "integer" + } + }, + { + "name": "model", + "in": "query", + "description": "The model to filter by.", + "required": false, + "schema": { + "type": "string" + } + }, + { + "name": "order", + "in": "query", + "description": "The order to sort the chat completions by: \"asc\" or \"desc\". Defaults to \"desc\".", + "required": false, + "schema": { + "$ref": "#/components/schemas/Order" + } + } + ] + }, "post": { "responses": { "200": { - "description": "Response from an OpenAI-compatible chat completion request. **OR** Chunk from a streaming response to an OpenAI-compatible chat completion request.", + "description": "An OpenAIChatCompletion.", "content": { "application/json": { "schema": { @@ -3233,11 +2735,681 @@ } } }, + "/v1/datasets": { + "get": { + "responses": { + "200": { + "description": "A ListDatasetsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListDatasetsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Datasets" + ], + "description": "List all datasets.", + "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "A Dataset.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Dataset" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Datasets" + ], + "description": "Register a new dataset.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterDatasetRequest" + } + } + }, + "required": true + } + } + }, + "/v1/files/{bucket}": { + "get": { + "responses": { + "200": { + "description": "A ListFileResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListFileResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" 
+ } + }, + "tags": [ + "Files" + ], + "description": "List all files in a bucket.", + "parameters": [ + { + "name": "bucket", + "in": "path", + "description": "Bucket name (valid chars: a-zA-Z0-9_-).", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, + "/v1/models": { + "get": { + "responses": { + "200": { + "description": "A ListModelsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListModelsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Models" + ], + "description": "List all models.", + "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "A Model.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Model" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Models" + ], + "description": "Register a model.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterModelRequest" + } + } + }, + "required": true + } + } + }, + "/v1/providers": { + "get": { + "responses": { + "200": { + "description": "A ListProvidersResponse containing information about all providers.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListProvidersResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Providers" + ], + "description": "List all available providers.", + "parameters": [] + } + }, + "/v1/inspect/routes": { + "get": { + "responses": { + "200": { + "description": "A ListRoutesResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListRoutesResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Inspect" + ], + "description": "List all routes.", + "parameters": [] + } + }, + "/v1/tool-runtime/list-tools": { + "get": { + "responses": { + "200": { + "description": "A ListToolDefsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListToolDefsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "ToolRuntime" + ], + "description": "List all tools in the runtime.", + "parameters": [ + { + "name": "tool_group_id", + "in": "query", + "description": "The ID of the 
tool group to list tools for.", + "required": false, + "schema": { + "type": "string" + } + }, + { + "name": "mcp_endpoint", + "in": "query", + "description": "The MCP endpoint to use for the tool group.", + "required": false, + "schema": { + "$ref": "#/components/schemas/URL" + } + } + ] + } + }, + "/v1/scoring-functions": { + "get": { + "responses": { + "200": { + "description": "A ListScoringFunctionsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListScoringFunctionsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "ScoringFunctions" + ], + "description": "List all scoring functions.", + "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "ScoringFunctions" + ], + "description": "Register a scoring function.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterScoringFunctionRequest" + } + } + }, + "required": true + } + } + }, + "/v1/shields": { + "get": { + "responses": { + "200": { + "description": "A ListShieldsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListShieldsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Shields" + ], + "description": "List all shields.", + "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "A Shield.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Shield" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Shields" + ], + "description": "Register a shield.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterShieldRequest" + } + } + }, + "required": true + } + } + }, + "/v1/toolgroups": { + "get": { + "responses": { + "200": { + "description": "A ListToolGroupsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListToolGroupsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "ToolGroups" + ], + "description": "List tool groups with optional provider.", + "parameters": [] + }, + "post": { + "responses": { 
+ "200": { + "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "ToolGroups" + ], + "description": "Register a tool group.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterToolGroupRequest" + } + } + }, + "required": true + } + } + }, + "/v1/tools": { + "get": { + "responses": { + "200": { + "description": "A ListToolsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListToolsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "ToolGroups" + ], + "description": "List tools with optional tool group.", + "parameters": [ + { + "name": "toolgroup_id", + "in": "query", + "description": "The ID of the tool group to list tools for.", + "required": false, + "schema": { + "type": "string" + } + } + ] + } + }, + "/v1/vector-dbs": { + "get": { + "responses": { + "200": { + "description": "A ListVectorDBsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListVectorDBsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "VectorDBs" + ], + "description": "List all vector databases.", + "parameters": [] + }, + "post": { + "responses": { + "200": { + "description": "A VectorDB.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/VectorDB" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "VectorDBs" + ], + "description": "Register a vector database.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RegisterVectorDbRequest" + } + } + }, + "required": true + } + } + }, + "/v1/telemetry/events": { + "post": { + "responses": { + "200": { + "description": "OK" + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Telemetry" + ], + "description": "Log an event.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/LogEventRequest" + } + } + }, + "required": true + } + } + }, "/v1/openai/v1/completions": { "post": { "responses": { "200": { - "description": "OK", + "description": "An OpenAICompletion.", "content": { "application/json": { "schema": { @@ -3280,7 +3452,7 
@@ "get": { "responses": { "200": { - "description": "OK", + "description": "A OpenAIListModelsResponse.", "content": { "application/json": { "schema": { @@ -3305,7 +3477,7 @@ "tags": [ "Models" ], - "description": "", + "description": "List models using the OpenAI API.", "parameters": [] } }, @@ -3313,7 +3485,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A PostTrainingJob.", "content": { "application/json": { "schema": { @@ -3338,7 +3510,7 @@ "tags": [ "PostTraining (Coming Soon)" ], - "description": "", + "description": "Run preference optimization of a model.", "parameters": [], "requestBody": { "content": { @@ -3399,7 +3571,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A QueryChunksResponse.", "content": { "application/json": { "schema": { @@ -3424,7 +3596,7 @@ "tags": [ "VectorIO" ], - "description": "", + "description": "Query chunks from a vector database.", "parameters": [], "requestBody": { "content": { @@ -3438,11 +3610,64 @@ } } }, + "/v1/telemetry/metrics/{metric_name}": { + "post": { + "responses": { + "200": { + "description": "A QueryMetricsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryMetricsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Telemetry" + ], + "description": "Query metrics.", + "parameters": [ + { + "name": "metric_name", + "in": "path", + "description": "The name of the metric to query.", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryMetricsRequest" + } + } + }, + "required": true + } + } + }, "/v1/telemetry/spans": { "post": { "responses": { "200": { - "description": "OK", + "description": "A QuerySpansResponse.", "content": { "application/json": { "schema": { @@ -3467,7 +3692,7 @@ "tags": [ "Telemetry" ], - "description": "", + "description": "Query spans.", "parameters": [], "requestBody": { "content": { @@ -3485,7 +3710,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A QueryTracesResponse.", "content": { "application/json": { "schema": { @@ -3510,7 +3735,7 @@ "tags": [ "Telemetry" ], - "description": "", + "description": "Query traces.", "parameters": [], "requestBody": { "content": { @@ -3657,7 +3882,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A RunShieldResponse.", "content": { "application/json": { "schema": { @@ -3682,7 +3907,7 @@ "tags": [ "Safety" ], - "description": "", + "description": "Run a shield.", "parameters": [], "requestBody": { "content": { @@ -3718,7 +3943,7 @@ "tags": [ "Telemetry" ], - "description": "", + "description": "Save spans to a dataset.", "parameters": [], "requestBody": { "content": { @@ -3736,7 +3961,7 @@ "post": { "responses": { "200": { - "description": "ScoreResponse object containing rows and aggregated results", + "description": "A ScoreResponse object containing rows and aggregated results.", "content": { "application/json": { "schema": { @@ -3779,7 +4004,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A ScoreBatchResponse.", "content": { "application/json": { "schema": { @@ -3804,7 +4029,7 @@ "tags": [ 
"Scoring" ], - "description": "", + "description": "Score a batch of rows.", "parameters": [], "requestBody": { "content": { @@ -3822,7 +4047,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "A PostTrainingJob.", "content": { "application/json": { "schema": { @@ -3847,7 +4072,7 @@ "tags": [ "PostTraining (Coming Soon)" ], - "description": "", + "description": "Run supervised fine-tuning of a model.", "parameters": [], "requestBody": { "content": { @@ -3908,7 +4133,7 @@ "get": { "responses": { "200": { - "description": "OK", + "description": "A VersionInfo.", "content": { "application/json": { "schema": { @@ -3933,7 +4158,7 @@ "tags": [ "Inspect" ], - "description": "", + "description": "Get the version of the service.", "parameters": [] } } @@ -3999,7 +4224,8 @@ } ] } - } + }, + "description": "The rows to append to the dataset." } }, "additionalProperties": false, @@ -4052,9 +4278,13 @@ "properties": { "type": { "type": "string", + "enum": [ + "json_schema", + "grammar" + ], + "description": "Must be \"grammar\" to identify this format type", "const": "grammar", - "default": "grammar", - "description": "Must be \"grammar\" to identify this format type" + "default": "grammar" }, "bnf": { "type": "object", @@ -4178,9 +4408,13 @@ "properties": { "type": { "type": "string", + "enum": [ + "json_schema", + "grammar" + ], + "description": "Must be \"json_schema\" to identify this format type", "const": "json_schema", - "default": "json_schema", - "description": "Must be \"json_schema\" to identify this format type" + "default": "json_schema" }, "json_schema": { "type": "object", @@ -4698,7 +4932,8 @@ "type": "object", "properties": { "model_id": { - "type": "string" + "type": "string", + "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." }, "messages_batch": { "type": "array", @@ -4707,22 +4942,27 @@ "items": { "$ref": "#/components/schemas/Message" } - } + }, + "description": "The messages to generate completions for." }, "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "$ref": "#/components/schemas/SamplingParams", + "description": "(Optional) Parameters to control the sampling strategy." }, "tools": { "type": "array", "items": { "$ref": "#/components/schemas/ToolDefinition" - } + }, + "description": "(Optional) List of tool definitions available to the model." }, "tool_config": { - "$ref": "#/components/schemas/ToolConfig" + "$ref": "#/components/schemas/ToolConfig", + "description": "(Optional) Configuration for tool use." }, "response_format": { - "$ref": "#/components/schemas/ResponseFormat" + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding." }, "logprobs": { "type": "object", @@ -4734,7 +4974,7 @@ } }, "additionalProperties": false, - "title": "LogProbConfig" + "description": "(Optional) If specified, log probabilities for each token position will be returned." } }, "additionalProperties": false, @@ -4837,19 +5077,23 @@ "type": "object", "properties": { "model_id": { - "type": "string" + "type": "string", + "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." }, "content_batch": { "type": "array", "items": { "$ref": "#/components/schemas/InterleavedContent" - } + }, + "description": "The content to generate completions for." 
}, "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "$ref": "#/components/schemas/SamplingParams", + "description": "(Optional) Parameters to control the sampling strategy." }, "response_format": { - "$ref": "#/components/schemas/ResponseFormat" + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding." }, "logprobs": { "type": "object", @@ -4861,7 +5105,7 @@ } }, "additionalProperties": false, - "title": "LogProbConfig" + "description": "(Optional) If specified, log probabilities for each token position will be returned." } }, "additionalProperties": false, @@ -4929,7 +5173,8 @@ "type": "object", "properties": { "job_uuid": { - "type": "string" + "type": "string", + "description": "The UUID of the job to cancel." } }, "additionalProperties": false, @@ -4950,18 +5195,18 @@ "items": { "$ref": "#/components/schemas/Message" }, - "description": "List of messages in the conversation" + "description": "List of messages in the conversation." }, "sampling_params": { "$ref": "#/components/schemas/SamplingParams", - "description": "Parameters to control the sampling strategy" + "description": "Parameters to control the sampling strategy." }, "tools": { "type": "array", "items": { "$ref": "#/components/schemas/ToolDefinition" }, - "description": "(Optional) List of tool definitions available to the model" + "description": "(Optional) List of tool definitions available to the model." }, "tool_choice": { "type": "string", @@ -5181,15 +5426,15 @@ }, "content": { "$ref": "#/components/schemas/InterleavedContent", - "description": "The content to generate a completion for" + "description": "The content to generate a completion for." }, "sampling_params": { "$ref": "#/components/schemas/SamplingParams", - "description": "(Optional) Parameters to control the sampling strategy" + "description": "(Optional) Parameters to control the sampling strategy." }, "response_format": { "$ref": "#/components/schemas/ResponseFormat", - "description": "(Optional) Grammar specification for guided (structured) decoding" + "description": "(Optional) Grammar specification for guided (structured) decoding." 
}, "stream": { "type": "boolean", @@ -5638,6 +5883,14 @@ }, "step_type": { "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ], + "title": "StepType", + "description": "Type of the step in an agent turn.", "const": "inference", "default": "inference" }, @@ -5679,6 +5932,14 @@ }, "step_type": { "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ], + "title": "StepType", + "description": "Type of the step in an agent turn.", "const": "memory_retrieval", "default": "memory_retrieval" }, @@ -5767,6 +6028,14 @@ }, "step_type": { "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ], + "title": "StepType", + "description": "Type of the step in an agent turn.", "const": "shield_call", "default": "shield_call" }, @@ -5807,6 +6076,14 @@ }, "step_type": { "type": "string", + "enum": [ + "inference", + "tool_execution", + "shield_call", + "memory_retrieval" + ], + "title": "StepType", + "description": "Type of the step in an agent turn.", "const": "tool_execution", "default": "tool_execution" }, @@ -6069,6 +6346,15 @@ "properties": { "event_type": { "type": "string", + "enum": [ + "step_start", + "step_complete", + "step_progress", + "turn_start", + "turn_complete", + "turn_awaiting_input" + ], + "title": "AgentTurnResponseEventType", "const": "step_complete", "default": "step_complete" }, @@ -6126,6 +6412,15 @@ "properties": { "event_type": { "type": "string", + "enum": [ + "step_start", + "step_complete", + "step_progress", + "turn_start", + "turn_complete", + "turn_awaiting_input" + ], + "title": "AgentTurnResponseEventType", "const": "step_progress", "default": "step_progress" }, @@ -6161,6 +6456,15 @@ "properties": { "event_type": { "type": "string", + "enum": [ + "step_start", + "step_complete", + "step_progress", + "turn_start", + "turn_complete", + "turn_awaiting_input" + ], + "title": "AgentTurnResponseEventType", "const": "step_start", "default": "step_start" }, @@ -6231,6 +6535,15 @@ "properties": { "event_type": { "type": "string", + "enum": [ + "step_start", + "step_complete", + "step_progress", + "turn_start", + "turn_complete", + "turn_awaiting_input" + ], + "title": "AgentTurnResponseEventType", "const": "turn_awaiting_input", "default": "turn_awaiting_input" }, @@ -6250,6 +6563,15 @@ "properties": { "event_type": { "type": "string", + "enum": [ + "step_start", + "step_complete", + "step_progress", + "turn_start", + "turn_complete", + "turn_awaiting_input" + ], + "title": "AgentTurnResponseEventType", "const": "turn_complete", "default": "turn_complete" }, @@ -6269,6 +6591,15 @@ "properties": { "event_type": { "type": "string", + "enum": [ + "step_start", + "step_complete", + "step_progress", + "turn_start", + "turn_complete", + "turn_awaiting_input" + ], + "title": "AgentTurnResponseEventType", "const": "turn_start", "default": "turn_start" }, @@ -6283,54 +6614,51 @@ ], "title": "AgentTurnResponseTurnStartPayload" }, - "OpenAIResponseInputMessage": { + "OpenAIResponseInput": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputFunctionToolCallOutput" + }, + { + "$ref": "#/components/schemas/OpenAIResponseMessage" + } + ] + }, + "OpenAIResponseInputFunctionToolCallOutput": { "type": "object", "properties": { - "content": { - "oneOf": [ - { 
- "type": "string" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAIResponseInputMessageContent" - } - } - ] + "call_id": { + "type": "string" }, - "role": { - "oneOf": [ - { - "type": "string", - "const": "system" - }, - { - "type": "string", - "const": "developer" - }, - { - "type": "string", - "const": "user" - }, - { - "type": "string", - "const": "assistant" - } - ] + "output": { + "type": "string" }, "type": { "type": "string", - "const": "message", - "default": "message" + "const": "function_call_output", + "default": "function_call_output" + }, + "id": { + "type": "string" + }, + "status": { + "type": "string" } }, "additionalProperties": false, "required": [ - "content", - "role" + "call_id", + "output", + "type" ], - "title": "OpenAIResponseInputMessage" + "title": "OpenAIResponseInputFunctionToolCallOutput", + "description": "This represents the output of a function call that gets passed back to the model." }, "OpenAIResponseInputMessageContent": { "oneOf": [ @@ -6405,6 +6733,113 @@ "title": "OpenAIResponseInputMessageContentText" }, "OpenAIResponseInputTool": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseInputToolWebSearch" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolFileSearch" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputToolFunction" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "web_search": "#/components/schemas/OpenAIResponseInputToolWebSearch", + "file_search": "#/components/schemas/OpenAIResponseInputToolFileSearch", + "function": "#/components/schemas/OpenAIResponseInputToolFunction" + } + } + }, + "OpenAIResponseInputToolFileSearch": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "file_search", + "default": "file_search" + }, + "vector_store_id": { + "type": "array", + "items": { + "type": "string" + } + }, + "ranking_options": { + "type": "object", + "properties": { + "ranker": { + "type": "string" + }, + "score_threshold": { + "type": "number", + "default": 0.0 + } + }, + "additionalProperties": false, + "title": "FileSearchRankingOptions" + } + }, + "additionalProperties": false, + "required": [ + "type", + "vector_store_id" + ], + "title": "OpenAIResponseInputToolFileSearch" + }, + "OpenAIResponseInputToolFunction": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "function", + "default": "function" + }, + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "parameters": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "strict": { + "type": "boolean" + } + }, + "additionalProperties": false, + "required": [ + "type", + "name" + ], + "title": "OpenAIResponseInputToolFunction" + }, + "OpenAIResponseInputToolWebSearch": { "type": "object", "properties": { "type": { @@ -6431,6 +6866,146 @@ ], "title": "OpenAIResponseInputToolWebSearch" }, + "OpenAIResponseMessage": { + "type": "object", + "properties": { + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContent" + } + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageContent" + } + } + ] + }, + "role": { + "oneOf": [ + { + "type": "string", + "const": "system" + }, + { + "type": 
"string", + "const": "developer" + }, + { + "type": "string", + "const": "user" + }, + { + "type": "string", + "const": "assistant" + } + ] + }, + "type": { + "type": "string", + "const": "message", + "default": "message" + }, + "id": { + "type": "string" + }, + "status": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "content", + "role", + "type" + ], + "title": "OpenAIResponseMessage", + "description": "Corresponds to the various Message types in the Responses API. They are all under one type because the Responses API gives them all the same \"type\" value, and there is no way to tell them apart in certain scenarios." + }, + "OpenAIResponseOutputMessageContent": { + "type": "object", + "properties": { + "text": { + "type": "string" + }, + "type": { + "type": "string", + "const": "output_text", + "default": "output_text" + } + }, + "additionalProperties": false, + "required": [ + "text", + "type" + ], + "title": "OpenAIResponseOutputMessageContentOutputText" + }, + "OpenAIResponseOutputMessageFunctionToolCall": { + "type": "object", + "properties": { + "arguments": { + "type": "string" + }, + "call_id": { + "type": "string" + }, + "name": { + "type": "string" + }, + "type": { + "type": "string", + "const": "function_call", + "default": "function_call" + }, + "id": { + "type": "string" + }, + "status": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "arguments", + "call_id", + "name", + "type", + "id", + "status" + ], + "title": "OpenAIResponseOutputMessageFunctionToolCall" + }, + "OpenAIResponseOutputMessageWebSearchToolCall": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "status": { + "type": "string" + }, + "type": { + "type": "string", + "const": "web_search_call", + "default": "web_search_call" + } + }, + "additionalProperties": false, + "required": [ + "id", + "status", + "type" + ], + "title": "OpenAIResponseOutputMessageWebSearchToolCall" + }, "CreateOpenaiResponseRequest": { "type": "object", "properties": { @@ -6442,7 +7017,7 @@ { "type": "array", "items": { - "$ref": "#/components/schemas/OpenAIResponseInputMessage" + "$ref": "#/components/schemas/OpenAIResponseInput" } } ], @@ -6560,98 +7135,24 @@ "OpenAIResponseOutput": { "oneOf": [ { - "$ref": "#/components/schemas/OpenAIResponseOutputMessage" + "$ref": "#/components/schemas/OpenAIResponseMessage" }, { "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall" } ], "discriminator": { "propertyName": "type", "mapping": { - "message": "#/components/schemas/OpenAIResponseOutputMessage", - "web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" + "message": "#/components/schemas/OpenAIResponseMessage", + "web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall", + "function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall" } } }, - "OpenAIResponseOutputMessage": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "content": { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAIResponseOutputMessageContent" - } - }, - "role": { - "type": "string", - "const": "assistant", - "default": "assistant" - }, - "status": { - "type": "string" - }, - "type": { - "type": "string", - "const": "message", - "default": "message" - } - }, - "additionalProperties": false, - "required": [ - "id", - "content", - "role", - 
"status", - "type" - ], - "title": "OpenAIResponseOutputMessage" - }, - "OpenAIResponseOutputMessageContent": { - "type": "object", - "properties": { - "text": { - "type": "string" - }, - "type": { - "type": "string", - "const": "output_text", - "default": "output_text" - } - }, - "additionalProperties": false, - "required": [ - "text", - "type" - ], - "title": "OpenAIResponseOutputMessageContentOutputText" - }, - "OpenAIResponseOutputMessageWebSearchToolCall": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "status": { - "type": "string" - }, - "type": { - "type": "string", - "const": "web_search_call", - "default": "web_search_call" - } - }, - "additionalProperties": false, - "required": [ - "id", - "status", - "type" - ], - "title": "OpenAIResponseOutputMessageWebSearchToolCall" - }, "OpenAIResponseObjectStream": { "oneOf": [ { @@ -6712,19 +7213,19 @@ "properties": { "bucket": { "type": "string", - "description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)" + "description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)." }, "key": { "type": "string", - "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)" + "description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)." }, "mime_type": { "type": "string", - "description": "MIME type of the file" + "description": "MIME type of the file." }, "size": { "type": "integer", - "description": "File size in bytes" + "description": "File size in bytes." } }, "additionalProperties": false, @@ -6876,7 +7377,7 @@ "type": "object", "properties": { "type": { - "type": "string", + "$ref": "#/components/schemas/ScoringFnParamsType", "const": "basic", "default": "basic" }, @@ -6889,7 +7390,8 @@ }, "additionalProperties": false, "required": [ - "type" + "type", + "aggregation_functions" ], "title": "BasicScoringFnParams" }, @@ -6941,7 +7443,7 @@ "type": "object", "properties": { "type": { - "type": "string", + "$ref": "#/components/schemas/ScoringFnParamsType", "const": "llm_as_judge", "default": "llm_as_judge" }, @@ -6967,7 +7469,9 @@ "additionalProperties": false, "required": [ "type", - "judge_model" + "judge_model", + "judge_score_regexes", + "aggregation_functions" ], "title": "LLMAsJudgeScoringFnParams" }, @@ -7005,7 +7509,7 @@ "type": "object", "properties": { "type": { - "type": "string", + "$ref": "#/components/schemas/ScoringFnParamsType", "const": "regex_parser", "default": "regex_parser" }, @@ -7024,7 +7528,9 @@ }, "additionalProperties": false, "required": [ - "type" + "type", + "parsing_regexes", + "aggregation_functions" ], "title": "RegexParserScoringFnParams" }, @@ -7049,6 +7555,15 @@ } } }, + "ScoringFnParamsType": { + "type": "string", + "enum": [ + "llm_as_judge", + "regex_parser", + "basic" + ], + "title": "ScoringFnParamsType" + }, "EvaluateRowsRequest": { "type": "object", "properties": { @@ -7317,6 +7832,17 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + "benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "benchmark", "default": "benchmark" }, @@ -7358,7 +7884,6 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type", "dataset_id", @@ -7367,6 +7892,482 @@ ], "title": "Benchmark" }, + "OpenAIAssistantMessageParam": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "assistant", + "default": "assistant", + "description": "Must be 
\"assistant\" to identify this as the model's response" + }, + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" + } + } + ], + "description": "The content of the model's response" + }, + "name": { + "type": "string", + "description": "(Optional) The name of the assistant message participant." + }, + "tool_calls": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionToolCall" + }, + "description": "List of tool calls. Each tool call is an OpenAIChatCompletionToolCall object." + } + }, + "additionalProperties": false, + "required": [ + "role" + ], + "title": "OpenAIAssistantMessageParam", + "description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request." + }, + "OpenAIChatCompletionContentPartImageParam": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "image_url", + "default": "image_url" + }, + "image_url": { + "$ref": "#/components/schemas/OpenAIImageURL" + } + }, + "additionalProperties": false, + "required": [ + "type", + "image_url" + ], + "title": "OpenAIChatCompletionContentPartImageParam" + }, + "OpenAIChatCompletionContentPartParam": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "text": "#/components/schemas/OpenAIChatCompletionContentPartTextParam", + "image_url": "#/components/schemas/OpenAIChatCompletionContentPartImageParam" + } + } + }, + "OpenAIChatCompletionContentPartTextParam": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "text", + "default": "text" + }, + "text": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "type", + "text" + ], + "title": "OpenAIChatCompletionContentPartTextParam" + }, + "OpenAIChatCompletionToolCall": { + "type": "object", + "properties": { + "index": { + "type": "integer" + }, + "id": { + "type": "string" + }, + "type": { + "type": "string", + "const": "function", + "default": "function" + }, + "function": { + "$ref": "#/components/schemas/OpenAIChatCompletionToolCallFunction" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "OpenAIChatCompletionToolCall" + }, + "OpenAIChatCompletionToolCallFunction": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "additionalProperties": false, + "title": "OpenAIChatCompletionToolCallFunction" + }, + "OpenAIChoice": { + "type": "object", + "properties": { + "message": { + "$ref": "#/components/schemas/OpenAIMessageParam", + "description": "The message from the model" + }, + "finish_reason": { + "type": "string", + "description": "The reason the model stopped generating" + }, + "index": { + "type": "integer", + "description": "The index of the choice" + }, + "logprobs": { + "$ref": "#/components/schemas/OpenAIChoiceLogprobs", + "description": "(Optional) The log probabilities for the tokens in the message" + } + }, + "additionalProperties": false, + "required": [ + "message", + "finish_reason", + "index" + ], + "title": "OpenAIChoice", + "description": "A choice from an OpenAI-compatible chat completion response." 
+ }, + "OpenAIChoiceLogprobs": { + "type": "object", + "properties": { + "content": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAITokenLogProb" + }, + "description": "(Optional) The log probabilities for the tokens in the message" + }, + "refusal": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAITokenLogProb" + }, + "description": "(Optional) The log probabilities for the tokens in the message" + } + }, + "additionalProperties": false, + "title": "OpenAIChoiceLogprobs", + "description": "The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response." + }, + "OpenAIDeveloperMessageParam": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "developer", + "default": "developer", + "description": "Must be \"developer\" to identify this as a developer message" + }, + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" + } + } + ], + "description": "The content of the developer message" + }, + "name": { + "type": "string", + "description": "(Optional) The name of the developer message participant." + } + }, + "additionalProperties": false, + "required": [ + "role", + "content" + ], + "title": "OpenAIDeveloperMessageParam", + "description": "A message from the developer in an OpenAI-compatible chat completion request." + }, + "OpenAIImageURL": { + "type": "object", + "properties": { + "url": { + "type": "string" + }, + "detail": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "url" + ], + "title": "OpenAIImageURL" + }, + "OpenAIMessageParam": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIUserMessageParam" + }, + { + "$ref": "#/components/schemas/OpenAISystemMessageParam" + }, + { + "$ref": "#/components/schemas/OpenAIAssistantMessageParam" + }, + { + "$ref": "#/components/schemas/OpenAIToolMessageParam" + }, + { + "$ref": "#/components/schemas/OpenAIDeveloperMessageParam" + } + ], + "discriminator": { + "propertyName": "role", + "mapping": { + "user": "#/components/schemas/OpenAIUserMessageParam", + "system": "#/components/schemas/OpenAISystemMessageParam", + "assistant": "#/components/schemas/OpenAIAssistantMessageParam", + "tool": "#/components/schemas/OpenAIToolMessageParam", + "developer": "#/components/schemas/OpenAIDeveloperMessageParam" + } + } + }, + "OpenAISystemMessageParam": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "system", + "default": "system", + "description": "Must be \"system\" to identify this as a system message" + }, + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" + } + } + ], + "description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)." + }, + "name": { + "type": "string", + "description": "(Optional) The name of the system message participant." + } + }, + "additionalProperties": false, + "required": [ + "role", + "content" + ], + "title": "OpenAISystemMessageParam", + "description": "A system message providing instructions or context to the model." 
+ }, + "OpenAITokenLogProb": { + "type": "object", + "properties": { + "token": { + "type": "string" + }, + "bytes": { + "type": "array", + "items": { + "type": "integer" + } + }, + "logprob": { + "type": "number" + }, + "top_logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAITopLogProb" + } + } + }, + "additionalProperties": false, + "required": [ + "token", + "logprob", + "top_logprobs" + ], + "title": "OpenAITokenLogProb", + "description": "The log probability for a token from an OpenAI-compatible chat completion response." + }, + "OpenAIToolMessageParam": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "tool", + "default": "tool", + "description": "Must be \"tool\" to identify this as a tool response" + }, + "tool_call_id": { + "type": "string", + "description": "Unique identifier for the tool call this response is for" + }, + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" + } + } + ], + "description": "The response content from the tool" + } + }, + "additionalProperties": false, + "required": [ + "role", + "tool_call_id", + "content" + ], + "title": "OpenAIToolMessageParam", + "description": "A message representing the result of a tool invocation in an OpenAI-compatible chat completion request." + }, + "OpenAITopLogProb": { + "type": "object", + "properties": { + "token": { + "type": "string" + }, + "bytes": { + "type": "array", + "items": { + "type": "integer" + } + }, + "logprob": { + "type": "number" + } + }, + "additionalProperties": false, + "required": [ + "token", + "logprob" + ], + "title": "OpenAITopLogProb", + "description": "The top log probability for a token from an OpenAI-compatible chat completion response." + }, + "OpenAIUserMessageParam": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "user", + "default": "user", + "description": "Must be \"user\" to identify this as a user message" + }, + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" + } + } + ], + "description": "The content of the message, which can include text and other media" + }, + "name": { + "type": "string", + "description": "(Optional) The name of the user message participant." + } + }, + "additionalProperties": false, + "required": [ + "role", + "content" + ], + "title": "OpenAIUserMessageParam", + "description": "A message from the user in an OpenAI-compatible chat completion request." 
+ }, + "OpenAICompletionWithInputMessages": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The ID of the chat completion" + }, + "choices": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChoice" + }, + "description": "List of choices" + }, + "object": { + "type": "string", + "const": "chat.completion", + "default": "chat.completion", + "description": "The object type, which will be \"chat.completion\"" + }, + "created": { + "type": "integer", + "description": "The Unix timestamp in seconds when the chat completion was created" + }, + "model": { + "type": "string", + "description": "The model that was used to generate the chat completion" + }, + "input_messages": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIMessageParam" + } + } + }, + "additionalProperties": false, + "required": [ + "id", + "choices", + "object", + "created", + "model", + "input_messages" + ], + "title": "OpenAICompletionWithInputMessages" + }, "DataSource": { "oneOf": [ { @@ -7398,6 +8399,17 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + "benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "dataset", "default": "dataset" }, @@ -7443,7 +8455,6 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type", "purpose", @@ -7573,6 +8584,17 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + "benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "model", "default": "model" }, @@ -7609,7 +8631,6 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type", "metadata", @@ -7808,6 +8829,17 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + "benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "scoring_function", "default": "scoring_function" }, @@ -7849,7 +8881,6 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type", "metadata", @@ -7901,6 +8932,17 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + "benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "shield", "default": "shield" }, @@ -7933,7 +8975,6 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type" ], @@ -8005,10 +9046,12 @@ "type": "array", "items": { "type": "string" - } + }, + "description": "The attributes to return in the tree." }, "max_depth": { - "type": "integer" + "type": "integer", + "description": "The maximum depth of the tree." 
} }, "additionalProperties": false, @@ -8113,6 +9156,17 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + "benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "tool", "default": "tool" }, @@ -8160,7 +9214,6 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type", "toolgroup_id", @@ -8193,6 +9246,17 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + "benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "tool_group", "default": "tool_group" }, @@ -8228,7 +9292,6 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type" ], @@ -8395,6 +9458,17 @@ }, "type": { "type": "string", + "enum": [ + "model", + "shield", + "vector_db", + "dataset", + "scoring_function", + "benchmark", + "tool", + "tool_group" + ], + "title": "ResourceType", "const": "vector_db", "default": "vector_db" }, @@ -8408,7 +9482,6 @@ "additionalProperties": false, "required": [ "identifier", - "provider_resource_id", "provider_id", "type", "embedding_model", @@ -8530,7 +9603,8 @@ "type": "object", "properties": { "vector_db_id": { - "type": "string" + "type": "string", + "description": "The identifier of the vector database to insert the chunks into." }, "chunks": { "type": "array", @@ -8572,10 +9646,12 @@ "metadata" ], "title": "Chunk" - } + }, + "description": "The chunks to insert." }, "ttl_seconds": { - "type": "integer" + "type": "integer", + "description": "The time to live of the chunks." } }, "additionalProperties": false, @@ -8662,7 +9738,8 @@ "type": "object", "properties": { "tool_name": { - "type": "string" + "type": "string", + "description": "The name of the tool to invoke." }, "kwargs": { "type": "object", @@ -8687,7 +9764,8 @@ "type": "object" } ] - } + }, + "description": "A dictionary of arguments to pass to the tool." 
} }, "additionalProperties": false, @@ -8808,38 +9886,6 @@ ], "title": "Job" }, - "ListAgentSessionsResponse": { - "type": "object", - "properties": { - "data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Session" - } - } - }, - "additionalProperties": false, - "required": [ - "data" - ], - "title": "ListAgentSessionsResponse" - }, - "ListAgentsResponse": { - "type": "object", - "properties": { - "data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Agent" - } - } - }, - "additionalProperties": false, - "required": [ - "data" - ], - "title": "ListAgentsResponse" - }, "BucketResponse": { "type": "object", "properties": { @@ -8887,6 +9933,91 @@ ], "title": "ListBenchmarksResponse" }, + "Order": { + "type": "string", + "enum": [ + "asc", + "desc" + ], + "title": "Order" + }, + "ListOpenAIChatCompletionResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The ID of the chat completion" + }, + "choices": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChoice" + }, + "description": "List of choices" + }, + "object": { + "type": "string", + "const": "chat.completion", + "default": "chat.completion", + "description": "The object type, which will be \"chat.completion\"" + }, + "created": { + "type": "integer", + "description": "The Unix timestamp in seconds when the chat completion was created" + }, + "model": { + "type": "string", + "description": "The model that was used to generate the chat completion" + }, + "input_messages": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIMessageParam" + } + } + }, + "additionalProperties": false, + "required": [ + "id", + "choices", + "object", + "created", + "model", + "input_messages" + ], + "title": "OpenAICompletionWithInputMessages" + } + }, + "has_more": { + "type": "boolean" + }, + "first_id": { + "type": "string" + }, + "last_id": { + "type": "string" + }, + "object": { + "type": "string", + "const": "list", + "default": "list" + } + }, + "additionalProperties": false, + "required": [ + "data", + "has_more", + "first_id", + "last_id", + "object" + ], + "title": "ListOpenAIChatCompletionResponse" + }, "ListDatasetsResponse": { "type": "object", "properties": { @@ -9110,6 +10241,15 @@ } } }, + "EventType": { + "type": "string", + "enum": [ + "unstructured_log", + "structured_log", + "metric" + ], + "title": "EventType" + }, "LogSeverity": { "type": "string", "enum": [ @@ -9158,7 +10298,7 @@ } }, "type": { - "type": "string", + "$ref": "#/components/schemas/EventType", "const": "metric", "default": "metric" }, @@ -9195,7 +10335,7 @@ "type": "object", "properties": { "type": { - "type": "string", + "$ref": "#/components/schemas/StructuredLogType", "const": "span_end", "default": "span_end" }, @@ -9214,7 +10354,7 @@ "type": "object", "properties": { "type": { - "type": "string", + "$ref": "#/components/schemas/StructuredLogType", "const": "span_start", "default": "span_start" }, @@ -9268,7 +10408,7 @@ } }, "type": { - "type": "string", + "$ref": "#/components/schemas/EventType", "const": "structured_log", "default": "structured_log" }, @@ -9303,6 +10443,14 @@ } } }, + "StructuredLogType": { + "type": "string", + "enum": [ + "span_start", + "span_end" + ], + "title": "StructuredLogType" + }, "UnstructuredLogEvent": { "type": "object", "properties": { @@ -9339,7 +10487,7 @@ } }, "type": { - "type": "string", + "$ref": 
"#/components/schemas/EventType", "const": "unstructured_log", "default": "unstructured_log" }, @@ -9365,10 +10513,12 @@ "type": "object", "properties": { "event": { - "$ref": "#/components/schemas/Event" + "$ref": "#/components/schemas/Event", + "description": "The event to log." }, "ttl_seconds": { - "type": "integer" + "type": "integer", + "description": "The time to live of the event." } }, "additionalProperties": false, @@ -9378,192 +10528,6 @@ ], "title": "LogEventRequest" }, - "OpenAIAssistantMessageParam": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "assistant", - "default": "assistant", - "description": "Must be \"assistant\" to identify this as the model's response" - }, - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" - } - } - ], - "description": "The content of the model's response" - }, - "name": { - "type": "string", - "description": "(Optional) The name of the assistant message participant." - }, - "tool_calls": { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAIChatCompletionToolCall" - }, - "description": "List of tool calls. Each tool call is an OpenAIChatCompletionToolCall object." - } - }, - "additionalProperties": false, - "required": [ - "role" - ], - "title": "OpenAIAssistantMessageParam", - "description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request." - }, - "OpenAIChatCompletionContentPartImageParam": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "image_url", - "default": "image_url" - }, - "image_url": { - "$ref": "#/components/schemas/OpenAIImageURL" - } - }, - "additionalProperties": false, - "required": [ - "type", - "image_url" - ], - "title": "OpenAIChatCompletionContentPartImageParam" - }, - "OpenAIChatCompletionContentPartParam": { - "oneOf": [ - { - "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam" - }, - { - "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "text": "#/components/schemas/OpenAIChatCompletionContentPartTextParam", - "image_url": "#/components/schemas/OpenAIChatCompletionContentPartImageParam" - } - } - }, - "OpenAIChatCompletionContentPartTextParam": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "text", - "default": "text" - }, - "text": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "type", - "text" - ], - "title": "OpenAIChatCompletionContentPartTextParam" - }, - "OpenAIChatCompletionToolCall": { - "type": "object", - "properties": { - "index": { - "type": "integer" - }, - "id": { - "type": "string" - }, - "type": { - "type": "string", - "const": "function", - "default": "function" - }, - "function": { - "$ref": "#/components/schemas/OpenAIChatCompletionToolCallFunction" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "OpenAIChatCompletionToolCall" - }, - "OpenAIChatCompletionToolCallFunction": { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "additionalProperties": false, - "title": "OpenAIChatCompletionToolCallFunction" - }, - "OpenAIDeveloperMessageParam": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "developer", - "default": "developer", - 
"description": "Must be \"developer\" to identify this as a developer message" - }, - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" - } - } - ], - "description": "The content of the developer message" - }, - "name": { - "type": "string", - "description": "(Optional) The name of the developer message participant." - } - }, - "additionalProperties": false, - "required": [ - "role", - "content" - ], - "title": "OpenAIDeveloperMessageParam", - "description": "A message from the developer in an OpenAI-compatible chat completion request." - }, - "OpenAIImageURL": { - "type": "object", - "properties": { - "url": { - "type": "string" - }, - "detail": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "url" - ], - "title": "OpenAIImageURL" - }, "OpenAIJSONSchema": { "type": "object", "properties": { @@ -9608,35 +10572,6 @@ ], "title": "OpenAIJSONSchema" }, - "OpenAIMessageParam": { - "oneOf": [ - { - "$ref": "#/components/schemas/OpenAIUserMessageParam" - }, - { - "$ref": "#/components/schemas/OpenAISystemMessageParam" - }, - { - "$ref": "#/components/schemas/OpenAIAssistantMessageParam" - }, - { - "$ref": "#/components/schemas/OpenAIToolMessageParam" - }, - { - "$ref": "#/components/schemas/OpenAIDeveloperMessageParam" - } - ], - "discriminator": { - "propertyName": "role", - "mapping": { - "user": "#/components/schemas/OpenAIUserMessageParam", - "system": "#/components/schemas/OpenAISystemMessageParam", - "assistant": "#/components/schemas/OpenAIAssistantMessageParam", - "tool": "#/components/schemas/OpenAIToolMessageParam", - "developer": "#/components/schemas/OpenAIDeveloperMessageParam" - } - } - }, "OpenAIResponseFormatJSONObject": { "type": "object", "properties": { @@ -9707,115 +10642,6 @@ ], "title": "OpenAIResponseFormatText" }, - "OpenAISystemMessageParam": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "system", - "default": "system", - "description": "Must be \"system\" to identify this as a system message" - }, - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" - } - } - ], - "description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)." - }, - "name": { - "type": "string", - "description": "(Optional) The name of the system message participant." - } - }, - "additionalProperties": false, - "required": [ - "role", - "content" - ], - "title": "OpenAISystemMessageParam", - "description": "A system message providing instructions or context to the model." 
- }, - "OpenAIToolMessageParam": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "tool", - "default": "tool", - "description": "Must be \"tool\" to identify this as a tool response" - }, - "tool_call_id": { - "type": "string", - "description": "Unique identifier for the tool call this response is for" - }, - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" - } - } - ], - "description": "The response content from the tool" - } - }, - "additionalProperties": false, - "required": [ - "role", - "tool_call_id", - "content" - ], - "title": "OpenAIToolMessageParam", - "description": "A message representing the result of a tool invocation in an OpenAI-compatible chat completion request." - }, - "OpenAIUserMessageParam": { - "type": "object", - "properties": { - "role": { - "type": "string", - "const": "user", - "default": "user", - "description": "Must be \"user\" to identify this as a user message" - }, - "content": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAIChatCompletionContentPartParam" - } - } - ], - "description": "The content of the message, which can include text and other media" - }, - "name": { - "type": "string", - "description": "(Optional) The name of the user message participant." - } - }, - "additionalProperties": false, - "required": [ - "role", - "content" - ], - "title": "OpenAIUserMessageParam", - "description": "A message from the user in an OpenAI-compatible chat completion request." - }, "OpenaiChatCompletionRequest": { "type": "object", "properties": { @@ -9828,11 +10654,11 @@ "items": { "$ref": "#/components/schemas/OpenAIMessageParam" }, - "description": "List of messages in the conversation" + "description": "List of messages in the conversation." }, "frequency_penalty": { "type": "number", - "description": "(Optional) The penalty for repeated tokens" + "description": "(Optional) The penalty for repeated tokens." }, "function_call": { "oneOf": [ @@ -9865,7 +10691,7 @@ } } ], - "description": "(Optional) The function call to use" + "description": "(Optional) The function call to use." }, "functions": { "type": "array", @@ -9894,46 +10720,46 @@ ] } }, - "description": "(Optional) List of functions to use" + "description": "(Optional) List of functions to use." }, "logit_bias": { "type": "object", "additionalProperties": { "type": "number" }, - "description": "(Optional) The logit bias to use" + "description": "(Optional) The logit bias to use." }, "logprobs": { "type": "boolean", - "description": "(Optional) The log probabilities to use" + "description": "(Optional) The log probabilities to use." }, "max_completion_tokens": { "type": "integer", - "description": "(Optional) The maximum number of tokens to generate" + "description": "(Optional) The maximum number of tokens to generate." }, "max_tokens": { "type": "integer", - "description": "(Optional) The maximum number of tokens to generate" + "description": "(Optional) The maximum number of tokens to generate." }, "n": { "type": "integer", - "description": "(Optional) The number of completions to generate" + "description": "(Optional) The number of completions to generate." }, "parallel_tool_calls": { "type": "boolean", - "description": "(Optional) Whether to parallelize tool calls" + "description": "(Optional) Whether to parallelize tool calls." 
}, "presence_penalty": { "type": "number", - "description": "(Optional) The penalty for repeated tokens" + "description": "(Optional) The penalty for repeated tokens." }, "response_format": { "$ref": "#/components/schemas/OpenAIResponseFormatParam", - "description": "(Optional) The response format to use" + "description": "(Optional) The response format to use." }, "seed": { "type": "integer", - "description": "(Optional) The seed to use" + "description": "(Optional) The seed to use." }, "stop": { "oneOf": [ @@ -9947,11 +10773,11 @@ } } ], - "description": "(Optional) The stop tokens to use" + "description": "(Optional) The stop tokens to use." }, "stream": { "type": "boolean", - "description": "(Optional) Whether to stream the response" + "description": "(Optional) Whether to stream the response." }, "stream_options": { "type": "object", @@ -9977,11 +10803,11 @@ } ] }, - "description": "(Optional) The stream options to use" + "description": "(Optional) The stream options to use." }, "temperature": { "type": "number", - "description": "(Optional) The temperature to use" + "description": "(Optional) The temperature to use." }, "tool_choice": { "oneOf": [ @@ -10014,7 +10840,7 @@ } } ], - "description": "(Optional) The tool choice to use" + "description": "(Optional) The tool choice to use." }, "tools": { "type": "array", @@ -10043,19 +10869,19 @@ ] } }, - "description": "(Optional) The tools to use" + "description": "(Optional) The tools to use." }, "top_logprobs": { "type": "integer", - "description": "(Optional) The top log probabilities to use" + "description": "(Optional) The top log probabilities to use." }, "top_p": { "type": "number", - "description": "(Optional) The top p to use" + "description": "(Optional) The top p to use." }, "user": { "type": "string", - "description": "(Optional) The user to use" + "description": "(Optional) The user to use." } }, "additionalProperties": false, @@ -10145,35 +10971,6 @@ "title": "OpenAIChatCompletionChunk", "description": "Chunk from a streaming response to an OpenAI-compatible chat completion request." }, - "OpenAIChoice": { - "type": "object", - "properties": { - "message": { - "$ref": "#/components/schemas/OpenAIMessageParam", - "description": "The message from the model" - }, - "finish_reason": { - "type": "string", - "description": "The reason the model stopped generating" - }, - "index": { - "type": "integer", - "description": "The index of the choice" - }, - "logprobs": { - "$ref": "#/components/schemas/OpenAIChoiceLogprobs", - "description": "(Optional) The log probabilities for the tokens in the message" - } - }, - "additionalProperties": false, - "required": [ - "message", - "finish_reason", - "index" - ], - "title": "OpenAIChoice", - "description": "A choice from an OpenAI-compatible chat completion response." - }, "OpenAIChoiceDelta": { "type": "object", "properties": { @@ -10201,28 +10998,6 @@ "title": "OpenAIChoiceDelta", "description": "A delta from an OpenAI-compatible chat completion streaming response." 
}, - "OpenAIChoiceLogprobs": { - "type": "object", - "properties": { - "content": { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAITokenLogProb" - }, - "description": "(Optional) The log probabilities for the tokens in the message" - }, - "refusal": { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAITokenLogProb" - }, - "description": "(Optional) The log probabilities for the tokens in the message" - } - }, - "additionalProperties": false, - "title": "OpenAIChoiceLogprobs", - "description": "The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response." - }, "OpenAIChunkChoice": { "type": "object", "properties": { @@ -10252,61 +11027,6 @@ "title": "OpenAIChunkChoice", "description": "A chunk choice from an OpenAI-compatible chat completion streaming response." }, - "OpenAITokenLogProb": { - "type": "object", - "properties": { - "token": { - "type": "string" - }, - "bytes": { - "type": "array", - "items": { - "type": "integer" - } - }, - "logprob": { - "type": "number" - }, - "top_logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/OpenAITopLogProb" - } - } - }, - "additionalProperties": false, - "required": [ - "token", - "logprob", - "top_logprobs" - ], - "title": "OpenAITokenLogProb", - "description": "The log probability for a token from an OpenAI-compatible chat completion response." - }, - "OpenAITopLogProb": { - "type": "object", - "properties": { - "token": { - "type": "string" - }, - "bytes": { - "type": "array", - "items": { - "type": "integer" - } - }, - "logprob": { - "type": "number" - } - }, - "additionalProperties": false, - "required": [ - "token", - "logprob" - ], - "title": "OpenAITopLogProb", - "description": "The top log probability for a token from an OpenAI-compatible chat completion response." - }, "OpenaiCompletionRequest": { "type": "object", "properties": { @@ -10341,46 +11061,46 @@ } } ], - "description": "The prompt to generate a completion for" + "description": "The prompt to generate a completion for." }, "best_of": { "type": "integer", - "description": "(Optional) The number of completions to generate" + "description": "(Optional) The number of completions to generate." }, "echo": { "type": "boolean", - "description": "(Optional) Whether to echo the prompt" + "description": "(Optional) Whether to echo the prompt." }, "frequency_penalty": { "type": "number", - "description": "(Optional) The penalty for repeated tokens" + "description": "(Optional) The penalty for repeated tokens." }, "logit_bias": { "type": "object", "additionalProperties": { "type": "number" }, - "description": "(Optional) The logit bias to use" + "description": "(Optional) The logit bias to use." }, "logprobs": { "type": "boolean", - "description": "(Optional) The log probabilities to use" + "description": "(Optional) The log probabilities to use." }, "max_tokens": { "type": "integer", - "description": "(Optional) The maximum number of tokens to generate" + "description": "(Optional) The maximum number of tokens to generate." }, "n": { "type": "integer", - "description": "(Optional) The number of completions to generate" + "description": "(Optional) The number of completions to generate." }, "presence_penalty": { "type": "number", - "description": "(Optional) The penalty for repeated tokens" + "description": "(Optional) The penalty for repeated tokens." }, "seed": { "type": "integer", - "description": "(Optional) The seed to use" + "description": "(Optional) The seed to use." 
}, "stop": { "oneOf": [ @@ -10394,11 +11114,11 @@ } } ], - "description": "(Optional) The stop tokens to use" + "description": "(Optional) The stop tokens to use." }, "stream": { "type": "boolean", - "description": "(Optional) Whether to stream the response" + "description": "(Optional) Whether to stream the response." }, "stream_options": { "type": "object", @@ -10424,19 +11144,19 @@ } ] }, - "description": "(Optional) The stream options to use" + "description": "(Optional) The stream options to use." }, "temperature": { "type": "number", - "description": "(Optional) The temperature to use" + "description": "(Optional) The temperature to use." }, "top_p": { "type": "number", - "description": "(Optional) The top p to use" + "description": "(Optional) The top p to use." }, "user": { "type": "string", - "description": "(Optional) The user to use" + "description": "(Optional) The user to use." }, "guided_choice": { "type": "array", @@ -10729,16 +11449,20 @@ "type": "object", "properties": { "job_uuid": { - "type": "string" + "type": "string", + "description": "The UUID of the job to create." }, "finetuned_model": { - "type": "string" + "type": "string", + "description": "The model to fine-tune." }, "algorithm_config": { - "$ref": "#/components/schemas/DPOAlignmentConfig" + "$ref": "#/components/schemas/DPOAlignmentConfig", + "description": "The algorithm configuration." }, "training_config": { - "$ref": "#/components/schemas/TrainingConfig" + "$ref": "#/components/schemas/TrainingConfig", + "description": "The training configuration." }, "hyperparam_search_config": { "type": "object", @@ -10763,7 +11487,8 @@ "type": "object" } ] - } + }, + "description": "The hyperparam search configuration." }, "logger_config": { "type": "object", @@ -10788,7 +11513,8 @@ "type": "object" } ] - } + }, + "description": "The logger configuration." } }, "additionalProperties": false, @@ -10862,24 +11588,34 @@ "type": "object", "properties": { "query_generator_config": { - "$ref": "#/components/schemas/RAGQueryGeneratorConfig" + "$ref": "#/components/schemas/RAGQueryGeneratorConfig", + "description": "Configuration for the query generator." }, "max_tokens_in_context": { "type": "integer", - "default": 4096 + "default": 4096, + "description": "Maximum number of tokens in the context." }, "max_chunks": { "type": "integer", - "default": 5 + "default": 5, + "description": "Maximum number of chunks to retrieve." + }, + "chunk_template": { + "type": "string", + "default": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n", + "description": "Template for formatting each retrieved chunk in the context. Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict). Default: \"Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n\"" } }, "additionalProperties": false, "required": [ "query_generator_config", "max_tokens_in_context", - "max_chunks" + "max_chunks", + "chunk_template" ], - "title": "RAGQueryConfig" + "title": "RAGQueryConfig", + "description": "Configuration for the RAG query generation." }, "RAGQueryGeneratorConfig": { "oneOf": [ @@ -10963,10 +11699,12 @@ "type": "object", "properties": { "vector_db_id": { - "type": "string" + "type": "string", + "description": "The identifier of the vector database to query." }, "query": { - "$ref": "#/components/schemas/InterleavedContent" + "$ref": "#/components/schemas/InterleavedContent", + "description": "The query to search for." 
}, "params": { "type": "object", @@ -10991,7 +11729,8 @@ "type": "object" } ] - } + }, + "description": "The parameters of the query." } }, "additionalProperties": false, @@ -11060,6 +11799,147 @@ ], "title": "QueryChunksResponse" }, + "QueryMetricsRequest": { + "type": "object", + "properties": { + "start_time": { + "type": "integer", + "description": "The start time of the metric to query." + }, + "end_time": { + "type": "integer", + "description": "The end time of the metric to query." + }, + "granularity": { + "type": "string", + "description": "The granularity of the metric to query." + }, + "query_type": { + "type": "string", + "enum": [ + "range", + "instant" + ], + "description": "The type of query to perform." + }, + "label_matchers": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "value": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": [ + "=", + "!=", + "=~", + "!~" + ], + "title": "MetricLabelOperator", + "default": "=" + } + }, + "additionalProperties": false, + "required": [ + "name", + "value", + "operator" + ], + "title": "MetricLabelMatcher" + }, + "description": "The label matchers to apply to the metric." + } + }, + "additionalProperties": false, + "required": [ + "start_time", + "query_type" + ], + "title": "QueryMetricsRequest" + }, + "MetricDataPoint": { + "type": "object", + "properties": { + "timestamp": { + "type": "integer" + }, + "value": { + "type": "number" + } + }, + "additionalProperties": false, + "required": [ + "timestamp", + "value" + ], + "title": "MetricDataPoint" + }, + "MetricLabel": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "value": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "name", + "value" + ], + "title": "MetricLabel" + }, + "MetricSeries": { + "type": "object", + "properties": { + "metric": { + "type": "string" + }, + "labels": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricLabel" + } + }, + "values": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricDataPoint" + } + } + }, + "additionalProperties": false, + "required": [ + "metric", + "labels", + "values" + ], + "title": "MetricSeries" + }, + "QueryMetricsResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricSeries" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ], + "title": "QueryMetricsResponse" + }, "QueryCondition": { "type": "object", "properties": { @@ -11117,16 +11997,19 @@ "type": "array", "items": { "$ref": "#/components/schemas/QueryCondition" - } + }, + "description": "The attribute filters to apply to the spans." }, "attributes_to_return": { "type": "array", "items": { "type": "string" - } + }, + "description": "The attributes to return in the spans." }, "max_depth": { - "type": "integer" + "type": "integer", + "description": "The maximum depth of the tree." } }, "additionalProperties": false, @@ -11159,19 +12042,23 @@ "type": "array", "items": { "$ref": "#/components/schemas/QueryCondition" - } + }, + "description": "The attribute filters to apply to the traces." }, "limit": { - "type": "integer" + "type": "integer", + "description": "The limit of traces to return." }, "offset": { - "type": "integer" + "type": "integer", + "description": "The offset of the traces to return." 
}, "order_by": { "type": "array", "items": { "type": "string" - } + }, + "description": "The order by of the traces to return." } }, "additionalProperties": false, @@ -11197,22 +12084,27 @@ "type": "object", "properties": { "benchmark_id": { - "type": "string" + "type": "string", + "description": "The ID of the benchmark to register." }, "dataset_id": { - "type": "string" + "type": "string", + "description": "The ID of the dataset to use for the benchmark." }, "scoring_functions": { "type": "array", "items": { "type": "string" - } + }, + "description": "The scoring functions to use for the benchmark." }, "provider_benchmark_id": { - "type": "string" + "type": "string", + "description": "The ID of the provider benchmark to use for the benchmark." }, "provider_id": { - "type": "string" + "type": "string", + "description": "The ID of the provider to use for the benchmark." }, "metadata": { "type": "object", @@ -11237,7 +12129,8 @@ "type": "object" } ] - } + }, + "description": "The metadata to use for the benchmark." } }, "additionalProperties": false, @@ -11258,7 +12151,7 @@ "eval/question-answer", "eval/messages-answer" ], - "description": "The purpose of the dataset. One of - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } - \"eval/question-answer\": The dataset contains a question column and an answer column for evaluation. { \"question\": \"What is the capital of France?\", \"answer\": \"Paris\" } - \"eval/messages-answer\": The dataset contains a messages column with list of messages and an answer column for evaluation. { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, my name is John Doe.\"}, {\"role\": \"assistant\", \"content\": \"Hello, John Doe. How can I help you today?\"}, {\"role\": \"user\", \"content\": \"What's my name?\"}, ], \"answer\": \"John Doe\" }" + "description": "The purpose of the dataset. One of: - \"post-training/messages\": The dataset contains a messages column with list of messages for post-training. { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, world!\"}, {\"role\": \"assistant\", \"content\": \"Hello, world!\"}, ] } - \"eval/question-answer\": The dataset contains a question column and an answer column for evaluation. { \"question\": \"What is the capital of France?\", \"answer\": \"Paris\" } - \"eval/messages-answer\": The dataset contains a messages column with list of messages and an answer column for evaluation. { \"messages\": [ {\"role\": \"user\", \"content\": \"Hello, my name is John Doe.\"}, {\"role\": \"assistant\", \"content\": \"Hello, John Doe. How can I help you today?\"}, {\"role\": \"user\", \"content\": \"What's my name?\"}, ], \"answer\": \"John Doe\" }" }, "source": { "$ref": "#/components/schemas/DataSource", @@ -11288,7 +12181,7 @@ } ] }, - "description": "The metadata for the dataset. - E.g. {\"description\": \"My dataset\"}" + "description": "The metadata for the dataset. - E.g. {\"description\": \"My dataset\"}." }, "dataset_id": { "type": "string", @@ -11306,13 +12199,16 @@ "type": "object", "properties": { "model_id": { - "type": "string" + "type": "string", + "description": "The identifier of the model to register." }, "provider_model_id": { - "type": "string" + "type": "string", + "description": "The identifier of the model in the provider." 
}, "provider_id": { - "type": "string" + "type": "string", + "description": "The identifier of the provider." }, "metadata": { "type": "object", @@ -11337,10 +12233,12 @@ "type": "object" } ] - } + }, + "description": "Any additional metadata for this model." }, "model_type": { - "$ref": "#/components/schemas/ModelType" + "$ref": "#/components/schemas/ModelType", + "description": "The type of model to register." } }, "additionalProperties": false, @@ -11353,22 +12251,28 @@ "type": "object", "properties": { "scoring_fn_id": { - "type": "string" + "type": "string", + "description": "The ID of the scoring function to register." }, "description": { - "type": "string" + "type": "string", + "description": "The description of the scoring function." }, "return_type": { - "$ref": "#/components/schemas/ParamType" + "$ref": "#/components/schemas/ParamType", + "description": "The return type of the scoring function." }, "provider_scoring_fn_id": { - "type": "string" + "type": "string", + "description": "The ID of the provider scoring function to use for the scoring function." }, "provider_id": { - "type": "string" + "type": "string", + "description": "The ID of the provider to use for the scoring function." }, "params": { - "$ref": "#/components/schemas/ScoringFnParams" + "$ref": "#/components/schemas/ScoringFnParams", + "description": "The parameters for the scoring function for benchmark eval, these can be overridden for app eval." } }, "additionalProperties": false, @@ -11383,13 +12287,16 @@ "type": "object", "properties": { "shield_id": { - "type": "string" + "type": "string", + "description": "The identifier of the shield to register." }, "provider_shield_id": { - "type": "string" + "type": "string", + "description": "The identifier of the shield in the provider." }, "provider_id": { - "type": "string" + "type": "string", + "description": "The identifier of the provider." }, "params": { "type": "object", @@ -11414,7 +12321,8 @@ "type": "object" } ] - } + }, + "description": "The parameters of the shield." } }, "additionalProperties": false, @@ -11427,13 +12335,16 @@ "type": "object", "properties": { "toolgroup_id": { - "type": "string" + "type": "string", + "description": "The ID of the tool group to register." }, "provider_id": { - "type": "string" + "type": "string", + "description": "The ID of the provider to use for the tool group." }, "mcp_endpoint": { - "$ref": "#/components/schemas/URL" + "$ref": "#/components/schemas/URL", + "description": "The MCP endpoint to use for the tool group." }, "args": { "type": "object", @@ -11458,7 +12369,8 @@ "type": "object" } ] - } + }, + "description": "A dictionary of arguments to pass to the tool group." } }, "additionalProperties": false, @@ -11472,19 +12384,24 @@ "type": "object", "properties": { "vector_db_id": { - "type": "string" + "type": "string", + "description": "The identifier of the vector database to register." }, "embedding_model": { - "type": "string" + "type": "string", + "description": "The embedding model to use." }, "embedding_dimension": { - "type": "integer" + "type": "integer", + "description": "The dimension of the embedding model." }, "provider_id": { - "type": "string" + "type": "string", + "description": "The identifier of the provider." }, "provider_vector_db_id": { - "type": "string" + "type": "string", + "description": "The identifier of the vector database in the provider." 
} }, "additionalProperties": false, @@ -11533,13 +12450,15 @@ "type": "object", "properties": { "shield_id": { - "type": "string" + "type": "string", + "description": "The identifier of the shield to run." }, "messages": { "type": "array", "items": { "$ref": "#/components/schemas/Message" - } + }, + "description": "The messages to run the shield on." }, "params": { "type": "object", @@ -11564,7 +12483,8 @@ "type": "object" } ] - } + }, + "description": "The parameters of the shield." } }, "additionalProperties": false, @@ -11592,19 +12512,23 @@ "type": "array", "items": { "$ref": "#/components/schemas/QueryCondition" - } + }, + "description": "The attribute filters to apply to the spans." }, "attributes_to_save": { "type": "array", "items": { "type": "string" - } + }, + "description": "The attributes to save to the dataset." }, "dataset_id": { - "type": "string" + "type": "string", + "description": "The ID of the dataset to save the spans to." }, "max_depth": { - "type": "integer" + "type": "integer", + "description": "The maximum depth of the tree." } }, "additionalProperties": false, @@ -11691,7 +12615,8 @@ "type": "object", "properties": { "dataset_id": { - "type": "string" + "type": "string", + "description": "The ID of the dataset to score." }, "scoring_functions": { "type": "object", @@ -11704,10 +12629,12 @@ "type": "null" } ] - } + }, + "description": "The scoring functions to use for the scoring." }, "save_results_dataset": { - "type": "boolean" + "type": "boolean", + "description": "Whether to save the results to a dataset." } }, "additionalProperties": false, @@ -11827,10 +12754,12 @@ "type": "object", "properties": { "job_uuid": { - "type": "string" + "type": "string", + "description": "The UUID of the job to create." }, "training_config": { - "$ref": "#/components/schemas/TrainingConfig" + "$ref": "#/components/schemas/TrainingConfig", + "description": "The training configuration." }, "hyperparam_search_config": { "type": "object", @@ -11855,7 +12784,8 @@ "type": "object" } ] - } + }, + "description": "The hyperparam search configuration." }, "logger_config": { "type": "object", @@ -11880,16 +12810,20 @@ "type": "object" } ] - } + }, + "description": "The logger configuration." }, "model": { - "type": "string" + "type": "string", + "description": "The model to fine-tune." }, "checkpoint_dir": { - "type": "string" + "type": "string", + "description": "The directory to save checkpoint(s) to." }, "algorithm_config": { - "$ref": "#/components/schemas/AlgorithmConfig" + "$ref": "#/components/schemas/AlgorithmConfig", + "description": "The algorithm configuration." } }, "additionalProperties": false, diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index cb73919d9..012610d02 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -27,10 +27,12 @@ paths: $ref: '#/components/responses/DefaultError' tags: - DatasetIO - description: '' + description: Append rows to a dataset. parameters: - name: dataset_id in: path + description: >- + The ID of the dataset to append the rows to. required: true schema: type: string @@ -44,7 +46,8 @@ paths: post: responses: '200': - description: OK + description: >- + A BatchChatCompletionResponse with the full completions. content: application/json: schema: @@ -61,7 +64,8 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Inference - description: '' + description: >- + Generate chat completions for a batch of messages using the specified model. 
parameters: [] requestBody: content: @@ -73,7 +77,8 @@ paths: post: responses: '200': - description: OK + description: >- + A BatchCompletionResponse with the full completions. content: application/json: schema: @@ -90,7 +95,8 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Inference - description: '' + description: >- + Generate completions for a batch of content using the specified model. parameters: [] requestBody: content: @@ -115,7 +121,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) - description: '' + description: Cancel a training job. parameters: [] requestBody: content: @@ -129,7 +135,7 @@ paths: '200': description: >- If stream=False, returns a ChatCompletionResponse with the full completion. - If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk + If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk. content: application/json: schema: @@ -164,7 +170,7 @@ paths: '200': description: >- If stream=False, returns a CompletionResponse with the full completion. - If stream=True, returns an SSE event stream of CompletionResponseStreamChunk + If stream=True, returns an SSE event stream of CompletionResponseStreamChunk. content: application/json: schema: @@ -197,11 +203,11 @@ paths: get: responses: '200': - description: A ListAgentsResponse. + description: A PaginatedResponse. content: application/json: schema: - $ref: '#/components/schemas/ListAgentsResponse' + $ref: '#/components/schemas/PaginatedResponse' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -215,7 +221,19 @@ paths: tags: - Agents description: List all agents. - parameters: [] + parameters: + - name: start_index + in: query + description: The index to start the pagination from. + required: false + schema: + type: integer + - name: limit + in: query + description: The number of agents to return. + required: false + schema: + type: integer post: responses: '200': @@ -288,7 +306,7 @@ paths: '200': description: >- If stream=False, returns a Turn object. If stream=True, returns an SSE - event stream of AgentTurnResponseStreamChunk + event stream of AgentTurnResponseStreamChunk. content: application/json: schema: @@ -334,8 +352,7 @@ paths: post: responses: '200': - description: >- - Runtime representation of an annotated type. + description: An OpenAIResponseObject. content: application/json: schema: @@ -367,7 +384,7 @@ paths: get: responses: '200': - description: OK + description: A ListBucketResponse. content: application/json: schema: @@ -388,13 +405,14 @@ paths: parameters: - name: bucket in: query + description: 'Bucket name (valid chars: a-zA-Z0-9_-).' required: true schema: type: string post: responses: '200': - description: OK + description: A FileUploadResponse. content: application/json: schema: @@ -465,7 +483,8 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Agents - description: Delete an agent by its ID. + description: >- + Delete an agent by its ID and its associated sessions and turns. parameters: - name: agent_id in: path @@ -477,7 +496,7 @@ paths: get: responses: '200': - description: OK + description: A Session. content: application/json: schema: @@ -534,7 +553,8 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Agents - description: Delete an agent session by its ID. + description: >- + Delete an agent session by its ID and its associated turns. 
parameters: - name: session_id in: path description: >- The ID of the session to delete. required: true schema: type: string - name: agent_id in: path description: >- The ID of the agent to delete the session for. required: true schema: type: string /v1/files/{bucket}/{key}: get: responses: '200': - description: OK + description: A FileResponse. content: application/json: schema: @@ -575,14 +595,14 @@ paths: parameters: - name: bucket in: path - description: 'Bucket name (valid chars: a-zA-Z0-9_-)' + description: 'Bucket name (valid chars: a-zA-Z0-9_-).' required: true schema: type: string - name: key in: path description: >- - Key under which the file is stored (valid chars: a-zA-Z0-9_-/.) + Key under which the file is stored (valid chars: a-zA-Z0-9_-/.). required: true schema: type: string @@ -607,14 +627,14 @@ paths: parameters: - name: bucket in: path - description: 'Bucket name (valid chars: a-zA-Z0-9_-)' + description: 'Bucket name (valid chars: a-zA-Z0-9_-).' required: true schema: type: string - name: key in: path description: >- - Key under which the file is stored (valid chars: a-zA-Z0-9_-/.) + Key under which the file is stored (valid chars: a-zA-Z0-9_-/.). required: true schema: type: string @@ -625,7 +645,7 @@ paths: description: >- An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you - can check model metadata using /models/{model_id} + can check model metadata using /models/{model_id}. content: application/json: schema: @@ -656,7 +676,7 @@ paths: responses: '200': description: >- - EvaluateResponse object containing generations and scores + EvaluateResponse object containing generations and scores. content: application/json: schema: @@ -782,7 +802,7 @@ paths: get: responses: '200': - description: OK + description: A Benchmark. content: application/json: schema: @@ -799,10 +819,40 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Benchmarks - description: '' + description: Get a benchmark by its ID. parameters: - name: benchmark_id in: path + description: The ID of the benchmark to get. + required: true + schema: + type: string + /v1/openai/v1/chat/completions/{completion_id}: + get: + responses: + '200': + description: An OpenAICompletionWithInputMessages. + content: + application/json: + schema: + $ref: '#/components/schemas/OpenAICompletionWithInputMessages' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Inference + description: Describe a chat completion by its ID. + parameters: + - name: completion_id + in: path + description: ID of the chat completion. required: true schema: type: string @@ -810,7 +860,7 @@ paths: get: responses: '200': - description: OK + description: A Dataset. content: application/json: schema: @@ -827,10 +877,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Datasets - description: '' + description: Get a dataset by its ID. parameters: - name: dataset_id in: path + description: The ID of the dataset to get. required: true schema: type: string @@ -850,10 +901,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Datasets - description: '' + description: Unregister a dataset by its ID. parameters: - name: dataset_id in: path + description: The ID of the dataset to unregister. required: true schema: type: string @@ -861,7 +913,7 @@ paths: get: responses: '200': - description: OK + description: A Model. 
content: application/json: schema: @@ -878,10 +930,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Models - description: '' + description: Get a model by its identifier. parameters: - name: model_id in: path + description: The identifier of the model to get. required: true schema: type: string @@ -901,10 +954,12 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Models - description: '' + description: Unregister a model. parameters: - name: model_id in: path + description: >- + The identifier of the model to unregister. required: true schema: type: string @@ -942,7 +997,7 @@ paths: get: responses: '200': - description: OK + description: A ScoringFn. content: application/json: schema: @@ -959,10 +1014,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - ScoringFunctions - description: '' + description: Get a scoring function by its ID. parameters: - name: scoring_fn_id in: path + description: The ID of the scoring function to get. required: true schema: type: string @@ -970,7 +1026,7 @@ paths: get: responses: '200': - description: OK + description: A Shield. content: application/json: schema: @@ -987,10 +1043,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Shields - description: '' + description: Get a shield by its identifier. parameters: - name: identifier in: path + description: The identifier of the shield to get. required: true schema: type: string @@ -998,7 +1055,7 @@ paths: get: responses: '200': - description: OK + description: A Span. content: application/json: schema: @@ -1015,15 +1072,18 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Telemetry - description: '' + description: Get a span by its ID. parameters: - name: trace_id in: path + description: >- + The ID of the trace to get the span from. required: true schema: type: string - name: span_id in: path + description: The ID of the span to get. required: true schema: type: string @@ -1031,7 +1091,7 @@ paths: post: responses: '200': - description: OK + description: A QuerySpanTreeResponse. content: application/json: schema: @@ -1048,10 +1108,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Telemetry - description: '' + description: Get a span tree by its ID. parameters: - name: span_id in: path + description: The ID of the span to get the tree from. required: true schema: type: string @@ -1065,7 +1126,7 @@ paths: get: responses: '200': - description: OK + description: A Tool. content: application/json: schema: @@ -1082,10 +1143,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - ToolGroups - description: '' + description: Get a tool by its name. parameters: - name: tool_name in: path + description: The name of the tool to get. required: true schema: type: string @@ -1093,7 +1155,7 @@ paths: get: responses: '200': - description: OK + description: A ToolGroup. content: application/json: schema: @@ -1110,10 +1172,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - ToolGroups - description: '' + description: Get a tool group by its ID. parameters: - name: toolgroup_id in: path + description: The ID of the tool group to get. required: true schema: type: string @@ -1133,10 +1196,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - ToolGroups - description: Unregister a tool group + description: Unregister a tool group. parameters: - name: toolgroup_id in: path + description: The ID of the tool group to unregister. 
required: true schema: type: string @@ -1144,7 +1208,7 @@ paths: get: responses: '200': - description: OK + description: A Trace. content: application/json: schema: @@ -1161,10 +1225,11 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Telemetry - description: '' + description: Get a trace by its ID. parameters: - name: trace_id in: path + description: The ID of the trace to get. required: true schema: type: string @@ -1172,7 +1237,7 @@ paths: get: responses: '200': - description: OK + description: A PostTrainingJobArtifactsResponse. content: application/json: schema: @@ -1189,10 +1254,12 @@ paths: $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) - description: '' + description: Get the artifacts of a training job. parameters: - name: job_uuid in: query + description: >- + The UUID of the job to get the artifacts of. required: true schema: type: string @@ -1200,7 +1267,7 @@ paths: get: responses: '200': - description: OK + description: A PostTrainingJobStatusResponse. content: application/json: schema: @@ -1217,10 +1284,12 @@ paths: $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) - description: '' + description: Get the status of a training job. parameters: - name: job_uuid in: query + description: >- + The UUID of the job to get the status of. required: true schema: type: string @@ -1228,7 +1297,7 @@ paths: get: responses: '200': - description: OK + description: A ListPostTrainingJobsResponse. content: application/json: schema: @@ -1245,13 +1314,13 @@ paths: $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) - description: '' + description: Get all training jobs. parameters: [] /v1/files/session:{upload_id}: get: responses: '200': - description: OK + description: A FileUploadResponse. content: application/json: schema: @@ -1269,18 +1338,19 @@ paths: tags: - Files description: >- - Returns information about an existsing upload session + Returns information about an existing upload session. parameters: - name: upload_id in: path - description: ID of the upload session + description: ID of the upload session. required: true schema: type: string post: responses: '200': - description: OK + description: >- + A FileResponse or None if the upload is not complete. content: application/json: schema: @@ -1305,7 +1375,7 @@ paths: parameters: - name: upload_id in: path - description: ID of the upload session + description: ID of the upload session. required: true schema: type: string @@ -1320,7 +1390,7 @@ paths: get: responses: '200': - description: OK + description: A VectorDB. content: application/json: schema: @@ -1337,10 +1407,12 @@ paths: $ref: '#/components/responses/DefaultError' tags: - VectorDBs - description: '' + description: Get a vector database by its identifier. parameters: - name: vector_db_id in: path + description: >- + The identifier of the vector database to get. required: true schema: type: string @@ -1360,10 +1432,12 @@ paths: $ref: '#/components/responses/DefaultError' tags: - VectorDBs - description: '' + description: Unregister a vector database. parameters: - name: vector_db_id in: path + description: >- + The identifier of the vector database to unregister. required: true schema: type: string @@ -1371,7 +1445,7 @@ paths: get: responses: '200': - description: OK + description: A HealthInfo. content: application/json: schema: @@ -1388,7 +1462,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Inspect - description: '' + description: Get the health of the service. 
parameters: [] /v1/tool-runtime/rag-tool/insert: post: @@ -1433,7 +1507,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - VectorIO - description: '' + description: Insert chunks into a vector database. parameters: [] requestBody: content: @@ -1445,7 +1519,8 @@ paths: get: responses: '200': - description: OK + description: >- + A ProviderInfo object containing the provider's details. content: application/json: schema: @@ -1462,10 +1537,12 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Providers - description: '' + description: >- + Get detailed information about a specific provider. parameters: - name: provider_id in: path + description: The ID of the provider to inspect. required: true schema: type: string @@ -1473,7 +1550,7 @@ paths: post: responses: '200': - description: OK + description: A ToolInvocationResult. content: application/json: schema: @@ -1490,7 +1567,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - ToolRuntime - description: Run a tool with the given arguments + description: Run a tool with the given arguments. parameters: [] requestBody: content: @@ -1502,7 +1579,7 @@ paths: get: responses: '200': - description: OK + description: A PaginatedResponse. content: application/json: schema: @@ -1531,9 +1608,9 @@ paths: The response includes: - - data: List of items for the current page + - data: List of items for the current page. - - has_more: Whether there are more items available after this set + - has_more: Whether there are more items available after this set. parameters: - name: dataset_id in: path @@ -1559,7 +1636,7 @@ paths: get: responses: '200': - description: The status of the evaluationjob. + description: The status of the evaluation job. content: application/json: schema: @@ -1662,11 +1739,11 @@ paths: get: responses: '200': - description: A ListAgentSessionsResponse. + description: A PaginatedResponse. content: application/json: schema: - $ref: '#/components/schemas/ListAgentSessionsResponse' + $ref: '#/components/schemas/PaginatedResponse' '400': $ref: '#/components/responses/BadRequest400' '429': @@ -1688,11 +1765,23 @@ paths: required: true schema: type: string + - name: start_index + in: query + description: The index to start the pagination from. + required: false + schema: + type: integer + - name: limit + in: query + description: The number of sessions to return. + required: false + schema: + type: integer /v1/eval/benchmarks: get: responses: '200': - description: OK + description: A ListBenchmarksResponse. content: application/json: schema: @@ -1709,7 +1798,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Benchmarks - description: '' + description: List all benchmarks. parameters: [] post: responses: @@ -1727,7 +1816,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Benchmarks - description: '' + description: Register a benchmark. 
parameters: [] requestBody: content: @@ -1735,472 +1824,61 @@ paths: schema: $ref: '#/components/schemas/RegisterBenchmarkRequest' required: true - /v1/datasets: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListDatasetsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Datasets - description: '' - parameters: [] - post: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/Dataset' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Datasets - description: Register a new dataset. - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterDatasetRequest' - required: true - /v1/files/{bucket}: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListFileResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Files - description: List all files in a bucket. - parameters: - - name: bucket - in: path - description: 'Bucket name (valid chars: a-zA-Z0-9_-)' - required: true - schema: - type: string - /v1/models: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListModelsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Models - description: '' - parameters: [] - post: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/Model' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Models - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterModelRequest' - required: true - /v1/providers: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListProvidersResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Providers - description: '' - parameters: [] - /v1/inspect/routes: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListRoutesResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - 
#/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Inspect - description: '' - parameters: [] - /v1/tool-runtime/list-tools: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListToolDefsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ToolRuntime - description: '' - parameters: - - name: tool_group_id - in: query - required: false - schema: - type: string - - name: mcp_endpoint - in: query - required: false - schema: - $ref: '#/components/schemas/URL' - /v1/scoring-functions: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListScoringFunctionsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ScoringFunctions - description: '' - parameters: [] - post: - responses: - '200': - description: OK - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ScoringFunctions - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterScoringFunctionRequest' - required: true - /v1/shields: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListShieldsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Shields - description: '' - parameters: [] - post: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/Shield' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Shields - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterShieldRequest' - required: true - /v1/toolgroups: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListToolGroupsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ToolGroups - description: List tool groups with optional provider - parameters: [] - post: - responses: - '200': - description: OK - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: 
'#/components/responses/DefaultError' - tags: - - ToolGroups - description: Register a tool group - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterToolGroupRequest' - required: true - /v1/tools: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListToolsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ToolGroups - description: List tools with optional tool group - parameters: - - name: toolgroup_id - in: query - required: false - schema: - type: string - /v1/vector-dbs: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListVectorDBsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - VectorDBs - description: '' - parameters: [] - post: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/VectorDB' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - VectorDBs - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterVectorDbRequest' - required: true - /v1/telemetry/events: - post: - responses: - '200': - description: OK - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Telemetry - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/LogEventRequest' - required: true /v1/openai/v1/chat/completions: + get: + responses: + '200': + description: A ListOpenAIChatCompletionResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListOpenAIChatCompletionResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Inference + description: List all chat completions. + parameters: + - name: after + in: query + description: >- + The ID of the last chat completion to return. + required: false + schema: + type: string + - name: limit + in: query + description: >- + The maximum number of chat completions to return. + required: false + schema: + type: integer + - name: model + in: query + description: The model to filter by. + required: false + schema: + type: string + - name: order + in: query + description: >- + The order to sort the chat completions by: "asc" or "desc". Defaults to + "desc". + required: false + schema: + $ref: '#/components/schemas/Order' post: responses: '200': - description: >- - Response from an OpenAI-compatible chat completion request. 
**OR** Chunk - from a streaming response to an OpenAI-compatible chat completion request. + description: An OpenAIChatCompletion. content: application/json: schema: @@ -2229,11 +1907,477 @@ paths: schema: $ref: '#/components/schemas/OpenaiChatCompletionRequest' required: true - /v1/openai/v1/completions: + /v1/datasets: + get: + responses: + '200': + description: A ListDatasetsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListDatasetsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Datasets + description: List all datasets. + parameters: [] + post: + responses: + '200': + description: A Dataset. + content: + application/json: + schema: + $ref: '#/components/schemas/Dataset' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Datasets + description: Register a new dataset. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterDatasetRequest' + required: true + /v1/files/{bucket}: + get: + responses: + '200': + description: A ListFileResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListFileResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Files + description: List all files in a bucket. + parameters: + - name: bucket + in: path + description: 'Bucket name (valid chars: a-zA-Z0-9_-).' + required: true + schema: + type: string + /v1/models: + get: + responses: + '200': + description: A ListModelsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListModelsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Models + description: List all models. + parameters: [] + post: + responses: + '200': + description: A Model. + content: + application/json: + schema: + $ref: '#/components/schemas/Model' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Models + description: Register a model. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterModelRequest' + required: true + /v1/providers: + get: + responses: + '200': + description: >- + A ListProvidersResponse containing information about all providers. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/ListProvidersResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Providers + description: List all available providers. + parameters: [] + /v1/inspect/routes: + get: + responses: + '200': + description: A ListRoutesResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListRoutesResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Inspect + description: List all routes. + parameters: [] + /v1/tool-runtime/list-tools: + get: + responses: + '200': + description: A ListToolDefsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListToolDefsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - ToolRuntime + description: List all tools in the runtime. + parameters: + - name: tool_group_id + in: query + description: >- + The ID of the tool group to list tools for. + required: false + schema: + type: string + - name: mcp_endpoint + in: query + description: >- + The MCP endpoint to use for the tool group. + required: false + schema: + $ref: '#/components/schemas/URL' + /v1/scoring-functions: + get: + responses: + '200': + description: A ListScoringFunctionsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListScoringFunctionsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - ScoringFunctions + description: List all scoring functions. + parameters: [] post: responses: '200': description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - ScoringFunctions + description: Register a scoring function. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterScoringFunctionRequest' + required: true + /v1/shields: + get: + responses: + '200': + description: A ListShieldsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListShieldsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Shields + description: List all shields. + parameters: [] + post: + responses: + '200': + description: A Shield. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/Shield' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Shields + description: Register a shield. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterShieldRequest' + required: true + /v1/toolgroups: + get: + responses: + '200': + description: A ListToolGroupsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListToolGroupsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - ToolGroups + description: List tool groups with optional provider. + parameters: [] + post: + responses: + '200': + description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - ToolGroups + description: Register a tool group. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterToolGroupRequest' + required: true + /v1/tools: + get: + responses: + '200': + description: A ListToolsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListToolsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - ToolGroups + description: List tools with optional tool group. + parameters: + - name: toolgroup_id + in: query + description: >- + The ID of the tool group to list tools for. + required: false + schema: + type: string + /v1/vector-dbs: + get: + responses: + '200': + description: A ListVectorDBsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListVectorDBsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - VectorDBs + description: List all vector databases. + parameters: [] + post: + responses: + '200': + description: A VectorDB. + content: + application/json: + schema: + $ref: '#/components/schemas/VectorDB' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - VectorDBs + description: Register a vector database. 
+ parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RegisterVectorDbRequest' + required: true + /v1/telemetry/events: + post: + responses: + '200': + description: OK + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Telemetry + description: Log an event. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/LogEventRequest' + required: true + /v1/openai/v1/completions: + post: + responses: + '200': + description: An OpenAICompletion. content: application/json: schema: @@ -2264,7 +2408,7 @@ paths: get: responses: '200': - description: OK + description: An OpenAIListModelsResponse. content: application/json: schema: @@ -2281,13 +2425,13 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Models - description: '' + description: List models using the OpenAI API. parameters: [] /v1/post-training/preference-optimize: post: responses: '200': - description: OK + description: A PostTrainingJob. content: application/json: schema: @@ -2304,7 +2448,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) - description: '' + description: Run preference optimization of a model. parameters: [] requestBody: content: @@ -2346,7 +2490,7 @@ paths: post: responses: '200': - description: OK + description: A QueryChunksResponse. content: application/json: schema: @@ -2363,7 +2507,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - VectorIO - description: '' + description: Query chunks from a vector database. parameters: [] requestBody: content: @@ -2371,11 +2515,46 @@ paths: schema: $ref: '#/components/schemas/QueryChunksRequest' required: true + /v1/telemetry/metrics/{metric_name}: + post: + responses: + '200': + description: A QueryMetricsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/QueryMetricsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Telemetry + description: Query metrics. + parameters: + - name: metric_name + in: path + description: The name of the metric to query. + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/QueryMetricsRequest' + required: true /v1/telemetry/spans: post: responses: '200': - description: OK + description: A QuerySpansResponse. content: application/json: schema: @@ -2392,7 +2571,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Telemetry - description: '' + description: Query spans. parameters: [] requestBody: content: @@ -2404,7 +2583,7 @@ paths: post: responses: '200': - description: OK + description: A QueryTracesResponse. content: application/json: schema: @@ -2421,7 +2600,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Telemetry - description: '' + description: Query traces. parameters: [] requestBody: content: @@ -2527,7 +2706,7 @@ paths: post: responses: '200': - description: OK + description: A RunShieldResponse. 
content: application/json: schema: @@ -2544,7 +2723,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Safety - description: '' + description: Run a shield. parameters: [] requestBody: content: @@ -2569,7 +2748,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Telemetry - description: '' + description: Save spans to a dataset. parameters: [] requestBody: content: @@ -2582,7 +2761,7 @@ paths: responses: '200': description: >- - ScoreResponse object containing rows and aggregated results + A ScoreResponse object containing rows and aggregated results. content: application/json: schema: @@ -2611,7 +2790,7 @@ paths: post: responses: '200': - description: OK + description: A ScoreBatchResponse. content: application/json: schema: @@ -2628,7 +2807,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Scoring - description: '' + description: Score a batch of rows. parameters: [] requestBody: content: @@ -2640,7 +2819,7 @@ paths: post: responses: '200': - description: OK + description: A PostTrainingJob. content: application/json: schema: @@ -2657,7 +2836,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - PostTraining (Coming Soon) - description: '' + description: Run supervised fine-tuning of a model. parameters: [] requestBody: content: @@ -2698,7 +2877,7 @@ paths: get: responses: '200': - description: OK + description: A VersionInfo. content: application/json: schema: @@ -2715,7 +2894,7 @@ paths: $ref: '#/components/responses/DefaultError' tags: - Inspect - description: '' + description: Get the version of the service. parameters: [] jsonSchemaDialect: >- https://json-schema.org/draft/2020-12/schema @@ -2764,6 +2943,7 @@ components: - type: string - type: array - type: object + description: The rows to append to the dataset. additionalProperties: false required: - rows @@ -2812,10 +2992,13 @@ components: properties: type: type: string - const: grammar - default: grammar + enum: + - json_schema + - grammar description: >- Must be "grammar" to identify this format type + const: grammar + default: grammar bnf: type: object additionalProperties: @@ -2897,10 +3080,13 @@ components: properties: type: type: string - const: json_schema - default: json_schema + enum: + - json_schema + - grammar description: >- Must be "json_schema" to identify this format type + const: json_schema + default: json_schema json_schema: type: object additionalProperties: @@ -3262,22 +3448,34 @@ components: properties: model_id: type: string + description: >- + The identifier of the model to use. The model must be registered with + Llama Stack and available via the /models endpoint. messages_batch: type: array items: type: array items: $ref: '#/components/schemas/Message' + description: >- + The messages to generate completions for. sampling_params: $ref: '#/components/schemas/SamplingParams' + description: >- + (Optional) Parameters to control the sampling strategy. tools: type: array items: $ref: '#/components/schemas/ToolDefinition' + description: >- + (Optional) List of tool definitions available to the model. tool_config: $ref: '#/components/schemas/ToolConfig' + description: (Optional) Configuration for tool use. response_format: $ref: '#/components/schemas/ResponseFormat' + description: >- + (Optional) Grammar specification for guided (structured) decoding. logprobs: type: object properties: @@ -3287,7 +3485,9 @@ components: description: >- How many tokens (for each position) to return log probabilities for. 
additionalProperties: false - title: LogProbConfig + description: >- + (Optional) If specified, log probabilities for each token position will + be returned. additionalProperties: false required: - model_id @@ -3360,14 +3560,22 @@ components: properties: model_id: type: string + description: >- + The identifier of the model to use. The model must be registered with + Llama Stack and available via the /models endpoint. content_batch: type: array items: $ref: '#/components/schemas/InterleavedContent' + description: The content to generate completions for. sampling_params: $ref: '#/components/schemas/SamplingParams' + description: >- + (Optional) Parameters to control the sampling strategy. response_format: $ref: '#/components/schemas/ResponseFormat' + description: >- + (Optional) Grammar specification for guided (structured) decoding. logprobs: type: object properties: @@ -3377,7 +3585,9 @@ components: description: >- How many tokens (for each position) to return log probabilities for. additionalProperties: false - title: LogProbConfig + description: >- + (Optional) If specified, log probabilities for each token position will + be returned. additionalProperties: false required: - model_id @@ -3428,6 +3638,7 @@ components: properties: job_uuid: type: string + description: The UUID of the job to cancel. additionalProperties: false required: - job_uuid @@ -3444,17 +3655,17 @@ components: type: array items: $ref: '#/components/schemas/Message' - description: List of messages in the conversation + description: List of messages in the conversation. sampling_params: $ref: '#/components/schemas/SamplingParams' description: >- - Parameters to control the sampling strategy + Parameters to control the sampling strategy. tools: type: array items: $ref: '#/components/schemas/ToolDefinition' description: >- - (Optional) List of tool definitions available to the model + (Optional) List of tool definitions available to the model. tool_choice: type: string enum: @@ -3637,15 +3848,16 @@ components: Llama Stack and available via the /models endpoint. content: $ref: '#/components/schemas/InterleavedContent' - description: The content to generate a completion for + description: >- + The content to generate a completion for. sampling_params: $ref: '#/components/schemas/SamplingParams' description: >- - (Optional) Parameters to control the sampling strategy + (Optional) Parameters to control the sampling strategy. response_format: $ref: '#/components/schemas/ResponseFormat' description: >- - (Optional) Grammar specification for guided (structured) decoding + (Optional) Grammar specification for guided (structured) decoding. stream: type: boolean description: >- @@ -3959,6 +4171,13 @@ components: description: The time the step completed. step_type: type: string + enum: + - inference + - tool_execution + - shield_call + - memory_retrieval + title: StepType + description: Type of the step in an agent turn. const: inference default: inference model_response: @@ -3991,6 +4210,13 @@ components: description: The time the step completed. step_type: type: string + enum: + - inference + - tool_execution + - shield_call + - memory_retrieval + title: StepType + description: Type of the step in an agent turn. const: memory_retrieval default: memory_retrieval vector_db_ids: @@ -4052,6 +4278,13 @@ components: description: The time the step completed. step_type: type: string + enum: + - inference + - tool_execution + - shield_call + - memory_retrieval + title: StepType + description: Type of the step in an agent turn. 
const: shield_call default: shield_call violation: @@ -4083,6 +4316,13 @@ components: description: The time the step completed. step_type: type: string + enum: + - inference + - tool_execution + - shield_call + - memory_retrieval + title: StepType + description: Type of the step in an agent turn. const: tool_execution default: tool_execution tool_calls: @@ -4245,6 +4485,14 @@ components: properties: event_type: type: string + enum: + - step_start + - step_complete + - step_progress + - turn_start + - turn_complete + - turn_awaiting_input + title: AgentTurnResponseEventType const: step_complete default: step_complete step_type: @@ -4283,6 +4531,14 @@ components: properties: event_type: type: string + enum: + - step_start + - step_complete + - step_progress + - turn_start + - turn_complete + - turn_awaiting_input + title: AgentTurnResponseEventType const: step_progress default: step_progress step_type: @@ -4310,6 +4566,14 @@ components: properties: event_type: type: string + enum: + - step_start + - step_complete + - step_progress + - turn_start + - turn_complete + - turn_awaiting_input + title: AgentTurnResponseEventType const: step_start default: step_start step_type: @@ -4354,6 +4618,14 @@ components: properties: event_type: type: string + enum: + - step_start + - step_complete + - step_progress + - turn_start + - turn_complete + - turn_awaiting_input + title: AgentTurnResponseEventType const: turn_awaiting_input default: turn_awaiting_input turn: @@ -4369,6 +4641,14 @@ components: properties: event_type: type: string + enum: + - step_start + - step_complete + - step_progress + - turn_start + - turn_complete + - turn_awaiting_input + title: AgentTurnResponseEventType const: turn_complete default: turn_complete turn: @@ -4383,6 +4663,14 @@ components: properties: event_type: type: string + enum: + - step_start + - step_complete + - step_progress + - turn_start + - turn_complete + - turn_awaiting_input + title: AgentTurnResponseEventType const: turn_start default: turn_start turn_id: @@ -4392,34 +4680,37 @@ components: - event_type - turn_id title: AgentTurnResponseTurnStartPayload - OpenAIResponseInputMessage: + OpenAIResponseInput: + oneOf: + - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' + - $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall' + - $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput' + - $ref: '#/components/schemas/OpenAIResponseMessage' + "OpenAIResponseInputFunctionToolCallOutput": type: object properties: - content: - oneOf: - - type: string - - type: array - items: - $ref: '#/components/schemas/OpenAIResponseInputMessageContent' - role: - oneOf: - - type: string - const: system - - type: string - const: developer - - type: string - const: user - - type: string - const: assistant + call_id: + type: string + output: + type: string type: type: string - const: message - default: message + const: function_call_output + default: function_call_output + id: + type: string + status: + type: string additionalProperties: false required: - - content - - role - title: OpenAIResponseInputMessage + - call_id + - output + - type + title: >- + OpenAIResponseInputFunctionToolCallOutput + description: >- + This represents the output of a function call that gets passed back to the + model. 
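[Reviewer note] Per the OpenAIResponseInputFunctionToolCallOutput schema just introduced, a function call result is passed back to the model as an input item on a follow-up response request. A rough sketch of such an item; the id and output values are illustrative only.

# Mirrors OpenAIResponseInputFunctionToolCallOutput: call_id, output, and type are required.
function_result_item = {
    "type": "function_call_output",
    "call_id": "call_abc123",           # hypothetical id echoed from the model's function_call
    "output": '{"temperature_c": 21}',  # tool result serialized as a string
}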
OpenAIResponseInputMessageContent: oneOf: - $ref: '#/components/schemas/OpenAIResponseInputMessageContentText' @@ -4467,6 +4758,71 @@ components: - type title: OpenAIResponseInputMessageContentText OpenAIResponseInputTool: + oneOf: + - $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch' + - $ref: '#/components/schemas/OpenAIResponseInputToolFileSearch' + - $ref: '#/components/schemas/OpenAIResponseInputToolFunction' + discriminator: + propertyName: type + mapping: + web_search: '#/components/schemas/OpenAIResponseInputToolWebSearch' + file_search: '#/components/schemas/OpenAIResponseInputToolFileSearch' + function: '#/components/schemas/OpenAIResponseInputToolFunction' + OpenAIResponseInputToolFileSearch: + type: object + properties: + type: + type: string + const: file_search + default: file_search + vector_store_id: + type: array + items: + type: string + ranking_options: + type: object + properties: + ranker: + type: string + score_threshold: + type: number + default: 0.0 + additionalProperties: false + title: FileSearchRankingOptions + additionalProperties: false + required: + - type + - vector_store_id + title: OpenAIResponseInputToolFileSearch + OpenAIResponseInputToolFunction: + type: object + properties: + type: + type: string + const: function + default: function + name: + type: string + description: + type: string + parameters: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + strict: + type: boolean + additionalProperties: false + required: + - type + - name + title: OpenAIResponseInputToolFunction + OpenAIResponseInputToolWebSearch: type: object properties: type: @@ -4483,6 +4839,106 @@ components: required: - type title: OpenAIResponseInputToolWebSearch + OpenAIResponseMessage: + type: object + properties: + content: + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIResponseInputMessageContent' + - type: array + items: + $ref: '#/components/schemas/OpenAIResponseOutputMessageContent' + role: + oneOf: + - type: string + const: system + - type: string + const: developer + - type: string + const: user + - type: string + const: assistant + type: + type: string + const: message + default: message + id: + type: string + status: + type: string + additionalProperties: false + required: + - content + - role + - type + title: OpenAIResponseMessage + description: >- + Corresponds to the various Message types in the Responses API. They are all + under one type because the Responses API gives them all the same "type" value, + and there is no way to tell them apart in certain scenarios. 
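[Reviewer note] OpenAIResponseInputTool is now a discriminated union over web_search, file_search, and function. A sketch of a tools array that would validate against these schemas; the function name, parameters, and vector store id are made up for illustration.

tools = [
    # OpenAIResponseInputToolFunction: type and name are required.
    {
        "type": "function",
        "name": "get_weather",                      # hypothetical function name
        "description": "Look up current weather",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
        "strict": True,
    },
    # OpenAIResponseInputToolFileSearch: vector_store_id is an array of ids.
    {
        "type": "file_search",
        "vector_store_id": ["vs_123"],              # hypothetical vector store id
        "ranking_options": {"ranker": "default", "score_threshold": 0.0},
    },
    # OpenAIResponseInputToolWebSearch needs only its type.
    {"type": "web_search"},
]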
+ OpenAIResponseOutputMessageContent: + type: object + properties: + text: + type: string + type: + type: string + const: output_text + default: output_text + additionalProperties: false + required: + - text + - type + title: >- + OpenAIResponseOutputMessageContentOutputText + "OpenAIResponseOutputMessageFunctionToolCall": + type: object + properties: + arguments: + type: string + call_id: + type: string + name: + type: string + type: + type: string + const: function_call + default: function_call + id: + type: string + status: + type: string + additionalProperties: false + required: + - arguments + - call_id + - name + - type + - id + - status + title: >- + OpenAIResponseOutputMessageFunctionToolCall + "OpenAIResponseOutputMessageWebSearchToolCall": + type: object + properties: + id: + type: string + status: + type: string + type: + type: string + const: web_search_call + default: web_search_call + additionalProperties: false + required: + - id + - status + - type + title: >- + OpenAIResponseOutputMessageWebSearchToolCall CreateOpenaiResponseRequest: type: object properties: @@ -4491,7 +4947,7 @@ components: - type: string - type: array items: - $ref: '#/components/schemas/OpenAIResponseInputMessage' + $ref: '#/components/schemas/OpenAIResponseInput' description: Input message(s) to create the response. model: type: string @@ -4575,73 +5031,15 @@ components: title: OpenAIResponseObject OpenAIResponseOutput: oneOf: - - $ref: '#/components/schemas/OpenAIResponseOutputMessage' + - $ref: '#/components/schemas/OpenAIResponseMessage' - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' + - $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall' discriminator: propertyName: type mapping: - message: '#/components/schemas/OpenAIResponseOutputMessage' + message: '#/components/schemas/OpenAIResponseMessage' web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' - OpenAIResponseOutputMessage: - type: object - properties: - id: - type: string - content: - type: array - items: - $ref: '#/components/schemas/OpenAIResponseOutputMessageContent' - role: - type: string - const: assistant - default: assistant - status: - type: string - type: - type: string - const: message - default: message - additionalProperties: false - required: - - id - - content - - role - - status - - type - title: OpenAIResponseOutputMessage - OpenAIResponseOutputMessageContent: - type: object - properties: - text: - type: string - type: - type: string - const: output_text - default: output_text - additionalProperties: false - required: - - text - - type - title: >- - OpenAIResponseOutputMessageContentOutputText - "OpenAIResponseOutputMessageWebSearchToolCall": - type: object - properties: - id: - type: string - status: - type: string - type: - type: string - const: web_search_call - default: web_search_call - additionalProperties: false - required: - - id - - status - - type - title: >- - OpenAIResponseOutputMessageWebSearchToolCall + function_call: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall' OpenAIResponseObjectStream: oneOf: - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated' @@ -4687,17 +5085,17 @@ components: bucket: type: string description: >- - Bucket under which the file is stored (valid chars: a-zA-Z0-9_-) + Bucket under which the file is stored (valid chars: a-zA-Z0-9_-). key: type: string description: >- - Key under which the file is stored (valid chars: a-zA-Z0-9_-/.) 
+ Key under which the file is stored (valid chars: a-zA-Z0-9_-/.). mime_type: type: string - description: MIME type of the file + description: MIME type of the file. size: type: integer - description: File size in bytes + description: File size in bytes. additionalProperties: false required: - bucket @@ -4825,7 +5223,7 @@ components: type: object properties: type: - type: string + $ref: '#/components/schemas/ScoringFnParamsType' const: basic default: basic aggregation_functions: @@ -4835,6 +5233,7 @@ components: additionalProperties: false required: - type + - aggregation_functions title: BasicScoringFnParams BenchmarkConfig: type: object @@ -4874,7 +5273,7 @@ components: type: object properties: type: - type: string + $ref: '#/components/schemas/ScoringFnParamsType' const: llm_as_judge default: llm_as_judge judge_model: @@ -4893,6 +5292,8 @@ components: required: - type - judge_model + - judge_score_regexes + - aggregation_functions title: LLMAsJudgeScoringFnParams ModelCandidate: type: object @@ -4923,7 +5324,7 @@ components: type: object properties: type: - type: string + $ref: '#/components/schemas/ScoringFnParamsType' const: regex_parser default: regex_parser parsing_regexes: @@ -4937,6 +5338,8 @@ components: additionalProperties: false required: - type + - parsing_regexes + - aggregation_functions title: RegexParserScoringFnParams ScoringFnParams: oneOf: @@ -4949,6 +5352,13 @@ components: llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' regex_parser: '#/components/schemas/RegexParserScoringFnParams' basic: '#/components/schemas/BasicScoringFnParams' + ScoringFnParamsType: + type: string + enum: + - llm_as_judge + - regex_parser + - basic + title: ScoringFnParamsType EvaluateRowsRequest: type: object properties: @@ -5111,6 +5521,16 @@ components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: benchmark default: benchmark dataset_id: @@ -5132,13 +5552,375 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type - dataset_id - scoring_functions - metadata title: Benchmark + OpenAIAssistantMessageParam: + type: object + properties: + role: + type: string + const: assistant + default: assistant + description: >- + Must be "assistant" to identify this as the model's response + content: + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' + description: The content of the model's response + name: + type: string + description: >- + (Optional) The name of the assistant message participant. + tool_calls: + type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionToolCall' + description: >- + List of tool calls. Each tool call is an OpenAIChatCompletionToolCall + object. + additionalProperties: false + required: + - role + title: OpenAIAssistantMessageParam + description: >- + A message containing the model's (assistant) response in an OpenAI-compatible + chat completion request. 
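[Reviewer note] With this change, LLMAsJudgeScoringFnParams requires judge_score_regexes and aggregation_functions in addition to type and judge_model. A sketch of params that would satisfy the tightened schema; the judge model id, regex, and aggregation function name are illustrative, not taken from this diff.

# Mirrors LLMAsJudgeScoringFnParams with the newly required fields populated.
judge_params = {
    "type": "llm_as_judge",
    "judge_model": "meta-llama/Llama-3.3-70B-Instruct",  # assumed judge model id
    "judge_score_regexes": [r"Score:\s*(\d)"],           # illustrative scoring regex
    "aggregation_functions": ["average"],                # assumed aggregation function name
}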
+ "OpenAIChatCompletionContentPartImageParam": + type: object + properties: + type: + type: string + const: image_url + default: image_url + image_url: + $ref: '#/components/schemas/OpenAIImageURL' + additionalProperties: false + required: + - type + - image_url + title: >- + OpenAIChatCompletionContentPartImageParam + OpenAIChatCompletionContentPartParam: + oneOf: + - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam' + - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam' + discriminator: + propertyName: type + mapping: + text: '#/components/schemas/OpenAIChatCompletionContentPartTextParam' + image_url: '#/components/schemas/OpenAIChatCompletionContentPartImageParam' + OpenAIChatCompletionContentPartTextParam: + type: object + properties: + type: + type: string + const: text + default: text + text: + type: string + additionalProperties: false + required: + - type + - text + title: OpenAIChatCompletionContentPartTextParam + OpenAIChatCompletionToolCall: + type: object + properties: + index: + type: integer + id: + type: string + type: + type: string + const: function + default: function + function: + $ref: '#/components/schemas/OpenAIChatCompletionToolCallFunction' + additionalProperties: false + required: + - type + title: OpenAIChatCompletionToolCall + OpenAIChatCompletionToolCallFunction: + type: object + properties: + name: + type: string + arguments: + type: string + additionalProperties: false + title: OpenAIChatCompletionToolCallFunction + OpenAIChoice: + type: object + properties: + message: + $ref: '#/components/schemas/OpenAIMessageParam' + description: The message from the model + finish_reason: + type: string + description: The reason the model stopped generating + index: + type: integer + description: The index of the choice + logprobs: + $ref: '#/components/schemas/OpenAIChoiceLogprobs' + description: >- + (Optional) The log probabilities for the tokens in the message + additionalProperties: false + required: + - message + - finish_reason + - index + title: OpenAIChoice + description: >- + A choice from an OpenAI-compatible chat completion response. + OpenAIChoiceLogprobs: + type: object + properties: + content: + type: array + items: + $ref: '#/components/schemas/OpenAITokenLogProb' + description: >- + (Optional) The log probabilities for the tokens in the message + refusal: + type: array + items: + $ref: '#/components/schemas/OpenAITokenLogProb' + description: >- + (Optional) The log probabilities for the tokens in the message + additionalProperties: false + title: OpenAIChoiceLogprobs + description: >- + The log probabilities for the tokens in the message from an OpenAI-compatible + chat completion response. + OpenAIDeveloperMessageParam: + type: object + properties: + role: + type: string + const: developer + default: developer + description: >- + Must be "developer" to identify this as a developer message + content: + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' + description: The content of the developer message + name: + type: string + description: >- + (Optional) The name of the developer message participant. + additionalProperties: false + required: + - role + - content + title: OpenAIDeveloperMessageParam + description: >- + A message from the developer in an OpenAI-compatible chat completion request. 
+ OpenAIImageURL: + type: object + properties: + url: + type: string + detail: + type: string + additionalProperties: false + required: + - url + title: OpenAIImageURL + OpenAIMessageParam: + oneOf: + - $ref: '#/components/schemas/OpenAIUserMessageParam' + - $ref: '#/components/schemas/OpenAISystemMessageParam' + - $ref: '#/components/schemas/OpenAIAssistantMessageParam' + - $ref: '#/components/schemas/OpenAIToolMessageParam' + - $ref: '#/components/schemas/OpenAIDeveloperMessageParam' + discriminator: + propertyName: role + mapping: + user: '#/components/schemas/OpenAIUserMessageParam' + system: '#/components/schemas/OpenAISystemMessageParam' + assistant: '#/components/schemas/OpenAIAssistantMessageParam' + tool: '#/components/schemas/OpenAIToolMessageParam' + developer: '#/components/schemas/OpenAIDeveloperMessageParam' + OpenAISystemMessageParam: + type: object + properties: + role: + type: string + const: system + default: system + description: >- + Must be "system" to identify this as a system message + content: + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' + description: >- + The content of the "system prompt". If multiple system messages are provided, + they are concatenated. The underlying Llama Stack code may also add other + system messages (for example, for formatting tool definitions). + name: + type: string + description: >- + (Optional) The name of the system message participant. + additionalProperties: false + required: + - role + - content + title: OpenAISystemMessageParam + description: >- + A system message providing instructions or context to the model. + OpenAITokenLogProb: + type: object + properties: + token: + type: string + bytes: + type: array + items: + type: integer + logprob: + type: number + top_logprobs: + type: array + items: + $ref: '#/components/schemas/OpenAITopLogProb' + additionalProperties: false + required: + - token + - logprob + - top_logprobs + title: OpenAITokenLogProb + description: >- + The log probability for a token from an OpenAI-compatible chat completion + response. + OpenAIToolMessageParam: + type: object + properties: + role: + type: string + const: tool + default: tool + description: >- + Must be "tool" to identify this as a tool response + tool_call_id: + type: string + description: >- + Unique identifier for the tool call this response is for + content: + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' + description: The response content from the tool + additionalProperties: false + required: + - role + - tool_call_id + - content + title: OpenAIToolMessageParam + description: >- + A message representing the result of a tool invocation in an OpenAI-compatible + chat completion request. + OpenAITopLogProb: + type: object + properties: + token: + type: string + bytes: + type: array + items: + type: integer + logprob: + type: number + additionalProperties: false + required: + - token + - logprob + title: OpenAITopLogProb + description: >- + The top log probability for a token from an OpenAI-compatible chat completion + response. 
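[Reviewer note] The relocated content-part schemas (OpenAIChatCompletionContentPartTextParam, OpenAIChatCompletionContentPartImageParam, OpenAIImageURL) allow mixed text and image content in a single OpenAI-compatible message. A sketch of a user-role message built from those parts; the URL and text are placeholders.

# A user message whose content is a list of OpenAI-compatible content parts.
user_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What is shown in this picture?"},
        {
            "type": "image_url",
            # image_url mirrors OpenAIImageURL: url is required, detail is optional.
            "image_url": {"url": "https://example.com/cat.png", "detail": "low"},  # placeholder URL
        },
    ],
}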
+ OpenAIUserMessageParam: + type: object + properties: + role: + type: string + const: user + default: user + description: >- + Must be "user" to identify this as a user message + content: + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' + description: >- + The content of the message, which can include text and other media + name: + type: string + description: >- + (Optional) The name of the user message participant. + additionalProperties: false + required: + - role + - content + title: OpenAIUserMessageParam + description: >- + A message from the user in an OpenAI-compatible chat completion request. + OpenAICompletionWithInputMessages: + type: object + properties: + id: + type: string + description: The ID of the chat completion + choices: + type: array + items: + $ref: '#/components/schemas/OpenAIChoice' + description: List of choices + object: + type: string + const: chat.completion + default: chat.completion + description: >- + The object type, which will be "chat.completion" + created: + type: integer + description: >- + The Unix timestamp in seconds when the chat completion was created + model: + type: string + description: >- + The model that was used to generate the chat completion + input_messages: + type: array + items: + $ref: '#/components/schemas/OpenAIMessageParam' + additionalProperties: false + required: + - id + - choices + - object + - created + - model + - input_messages + title: OpenAICompletionWithInputMessages DataSource: oneOf: - $ref: '#/components/schemas/URIDataSource' @@ -5159,6 +5941,16 @@ components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: dataset default: dataset purpose: @@ -5185,7 +5977,6 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type - purpose @@ -5284,6 +6075,16 @@ components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: model default: model metadata: @@ -5302,7 +6103,6 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type - metadata @@ -5438,6 +6238,16 @@ components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: scoring_function default: scoring_function description: @@ -5459,7 +6269,6 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type - metadata @@ -5498,6 +6307,16 @@ components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: shield default: shield params: @@ -5513,7 +6332,6 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type title: Shield @@ -5560,8 +6378,10 @@ components: type: array items: type: string + description: The attributes to return in the tree. max_depth: type: integer + description: The maximum depth of the tree. 
additionalProperties: false title: GetSpanTreeRequest SpanStatus: @@ -5628,6 +6448,16 @@ components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: tool default: tool toolgroup_id: @@ -5653,7 +6483,6 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type - toolgroup_id @@ -5679,6 +6508,16 @@ components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: tool_group default: tool_group mcp_endpoint: @@ -5696,7 +6535,6 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type title: ToolGroup @@ -5810,6 +6648,16 @@ components: type: string type: type: string + enum: + - model + - shield + - vector_db + - dataset + - scoring_function + - benchmark + - tool + - tool_group + title: ResourceType const: vector_db default: vector_db embedding_model: @@ -5819,7 +6667,6 @@ components: additionalProperties: false required: - identifier - - provider_resource_id - provider_id - type - embedding_model @@ -5898,6 +6745,8 @@ components: properties: vector_db_id: type: string + description: >- + The identifier of the vector database to insert the chunks into. chunks: type: array items: @@ -5920,8 +6769,10 @@ components: - content - metadata title: Chunk + description: The chunks to insert. ttl_seconds: type: integer + description: The time to live of the chunks. additionalProperties: false required: - vector_db_id @@ -5969,6 +6820,7 @@ components: properties: tool_name: type: string + description: The name of the tool to invoke. kwargs: type: object additionalProperties: @@ -5979,6 +6831,8 @@ components: - type: string - type: array - type: object + description: >- + A dictionary of arguments to pass to the tool. 
additionalProperties: false required: - tool_name @@ -6051,28 +6905,6 @@ components: - job_id - status title: Job - ListAgentSessionsResponse: - type: object - properties: - data: - type: array - items: - $ref: '#/components/schemas/Session' - additionalProperties: false - required: - - data - title: ListAgentSessionsResponse - ListAgentsResponse: - type: object - properties: - data: - type: array - items: - $ref: '#/components/schemas/Agent' - additionalProperties: false - required: - - data - title: ListAgentsResponse BucketResponse: type: object properties: @@ -6107,6 +6939,73 @@ components: required: - data title: ListBenchmarksResponse + Order: + type: string + enum: + - asc + - desc + title: Order + ListOpenAIChatCompletionResponse: + type: object + properties: + data: + type: array + items: + type: object + properties: + id: + type: string + description: The ID of the chat completion + choices: + type: array + items: + $ref: '#/components/schemas/OpenAIChoice' + description: List of choices + object: + type: string + const: chat.completion + default: chat.completion + description: >- + The object type, which will be "chat.completion" + created: + type: integer + description: >- + The Unix timestamp in seconds when the chat completion was created + model: + type: string + description: >- + The model that was used to generate the chat completion + input_messages: + type: array + items: + $ref: '#/components/schemas/OpenAIMessageParam' + additionalProperties: false + required: + - id + - choices + - object + - created + - model + - input_messages + title: OpenAICompletionWithInputMessages + has_more: + type: boolean + first_id: + type: string + last_id: + type: string + object: + type: string + const: list + default: list + additionalProperties: false + required: + - data + - has_more + - first_id + - last_id + - object + title: ListOpenAIChatCompletionResponse ListDatasetsResponse: type: object properties: @@ -6259,6 +7158,13 @@ components: unstructured_log: '#/components/schemas/UnstructuredLogEvent' metric: '#/components/schemas/MetricEvent' structured_log: '#/components/schemas/StructuredLogEvent' + EventType: + type: string + enum: + - unstructured_log + - structured_log + - metric + title: EventType LogSeverity: type: string enum: @@ -6289,7 +7195,7 @@ components: - type: boolean - type: 'null' type: - type: string + $ref: '#/components/schemas/EventType' const: metric default: metric metric: @@ -6314,7 +7220,7 @@ components: type: object properties: type: - type: string + $ref: '#/components/schemas/StructuredLogType' const: span_end default: span_end status: @@ -6328,7 +7234,7 @@ components: type: object properties: type: - type: string + $ref: '#/components/schemas/StructuredLogType' const: span_start default: span_start name: @@ -6360,7 +7266,7 @@ components: - type: boolean - type: 'null' type: - type: string + $ref: '#/components/schemas/EventType' const: structured_log default: structured_log payload: @@ -6382,6 +7288,12 @@ components: mapping: span_start: '#/components/schemas/SpanStartPayload' span_end: '#/components/schemas/SpanEndPayload' + StructuredLogType: + type: string + enum: + - span_start + - span_end + title: StructuredLogType UnstructuredLogEvent: type: object properties: @@ -6402,7 +7314,7 @@ components: - type: boolean - type: 'null' type: - type: string + $ref: '#/components/schemas/EventType' const: unstructured_log default: unstructured_log message: @@ -6423,149 +7335,15 @@ components: properties: event: $ref: '#/components/schemas/Event' + description: 
The event to log. ttl_seconds: type: integer + description: The time to live of the event. additionalProperties: false required: - event - ttl_seconds title: LogEventRequest - OpenAIAssistantMessageParam: - type: object - properties: - role: - type: string - const: assistant - default: assistant - description: >- - Must be "assistant" to identify this as the model's response - content: - oneOf: - - type: string - - type: array - items: - $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' - description: The content of the model's response - name: - type: string - description: >- - (Optional) The name of the assistant message participant. - tool_calls: - type: array - items: - $ref: '#/components/schemas/OpenAIChatCompletionToolCall' - description: >- - List of tool calls. Each tool call is an OpenAIChatCompletionToolCall - object. - additionalProperties: false - required: - - role - title: OpenAIAssistantMessageParam - description: >- - A message containing the model's (assistant) response in an OpenAI-compatible - chat completion request. - "OpenAIChatCompletionContentPartImageParam": - type: object - properties: - type: - type: string - const: image_url - default: image_url - image_url: - $ref: '#/components/schemas/OpenAIImageURL' - additionalProperties: false - required: - - type - - image_url - title: >- - OpenAIChatCompletionContentPartImageParam - OpenAIChatCompletionContentPartParam: - oneOf: - - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam' - - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam' - discriminator: - propertyName: type - mapping: - text: '#/components/schemas/OpenAIChatCompletionContentPartTextParam' - image_url: '#/components/schemas/OpenAIChatCompletionContentPartImageParam' - OpenAIChatCompletionContentPartTextParam: - type: object - properties: - type: - type: string - const: text - default: text - text: - type: string - additionalProperties: false - required: - - type - - text - title: OpenAIChatCompletionContentPartTextParam - OpenAIChatCompletionToolCall: - type: object - properties: - index: - type: integer - id: - type: string - type: - type: string - const: function - default: function - function: - $ref: '#/components/schemas/OpenAIChatCompletionToolCallFunction' - additionalProperties: false - required: - - type - title: OpenAIChatCompletionToolCall - OpenAIChatCompletionToolCallFunction: - type: object - properties: - name: - type: string - arguments: - type: string - additionalProperties: false - title: OpenAIChatCompletionToolCallFunction - OpenAIDeveloperMessageParam: - type: object - properties: - role: - type: string - const: developer - default: developer - description: >- - Must be "developer" to identify this as a developer message - content: - oneOf: - - type: string - - type: array - items: - $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' - description: The content of the developer message - name: - type: string - description: >- - (Optional) The name of the developer message participant. - additionalProperties: false - required: - - role - - content - title: OpenAIDeveloperMessageParam - description: >- - A message from the developer in an OpenAI-compatible chat completion request. 
- OpenAIImageURL: - type: object - properties: - url: - type: string - detail: - type: string - additionalProperties: false - required: - - url - title: OpenAIImageURL OpenAIJSONSchema: type: object properties: @@ -6589,21 +7367,6 @@ components: required: - name title: OpenAIJSONSchema - OpenAIMessageParam: - oneOf: - - $ref: '#/components/schemas/OpenAIUserMessageParam' - - $ref: '#/components/schemas/OpenAISystemMessageParam' - - $ref: '#/components/schemas/OpenAIAssistantMessageParam' - - $ref: '#/components/schemas/OpenAIToolMessageParam' - - $ref: '#/components/schemas/OpenAIDeveloperMessageParam' - discriminator: - propertyName: role - mapping: - user: '#/components/schemas/OpenAIUserMessageParam' - system: '#/components/schemas/OpenAISystemMessageParam' - assistant: '#/components/schemas/OpenAIAssistantMessageParam' - tool: '#/components/schemas/OpenAIToolMessageParam' - developer: '#/components/schemas/OpenAIDeveloperMessageParam' OpenAIResponseFormatJSONObject: type: object properties: @@ -6651,93 +7414,6 @@ components: required: - type title: OpenAIResponseFormatText - OpenAISystemMessageParam: - type: object - properties: - role: - type: string - const: system - default: system - description: >- - Must be "system" to identify this as a system message - content: - oneOf: - - type: string - - type: array - items: - $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' - description: >- - The content of the "system prompt". If multiple system messages are provided, - they are concatenated. The underlying Llama Stack code may also add other - system messages (for example, for formatting tool definitions). - name: - type: string - description: >- - (Optional) The name of the system message participant. - additionalProperties: false - required: - - role - - content - title: OpenAISystemMessageParam - description: >- - A system message providing instructions or context to the model. - OpenAIToolMessageParam: - type: object - properties: - role: - type: string - const: tool - default: tool - description: >- - Must be "tool" to identify this as a tool response - tool_call_id: - type: string - description: >- - Unique identifier for the tool call this response is for - content: - oneOf: - - type: string - - type: array - items: - $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' - description: The response content from the tool - additionalProperties: false - required: - - role - - tool_call_id - - content - title: OpenAIToolMessageParam - description: >- - A message representing the result of a tool invocation in an OpenAI-compatible - chat completion request. - OpenAIUserMessageParam: - type: object - properties: - role: - type: string - const: user - default: user - description: >- - Must be "user" to identify this as a user message - content: - oneOf: - - type: string - - type: array - items: - $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam' - description: >- - The content of the message, which can include text and other media - name: - type: string - description: >- - (Optional) The name of the user message participant. - additionalProperties: false - required: - - role - - content - title: OpenAIUserMessageParam - description: >- - A message from the user in an OpenAI-compatible chat completion request. 
OpenaiChatCompletionRequest: type: object properties: @@ -6750,11 +7426,11 @@ components: type: array items: $ref: '#/components/schemas/OpenAIMessageParam' - description: List of messages in the conversation + description: List of messages in the conversation. frequency_penalty: type: number description: >- - (Optional) The penalty for repeated tokens + (Optional) The penalty for repeated tokens. function_call: oneOf: - type: string @@ -6767,7 +7443,7 @@ components: - type: string - type: array - type: object - description: (Optional) The function call to use + description: (Optional) The function call to use. functions: type: array items: @@ -6780,52 +7456,52 @@ components: - type: string - type: array - type: object - description: (Optional) List of functions to use + description: (Optional) List of functions to use. logit_bias: type: object additionalProperties: type: number - description: (Optional) The logit bias to use + description: (Optional) The logit bias to use. logprobs: type: boolean - description: (Optional) The log probabilities to use + description: (Optional) The log probabilities to use. max_completion_tokens: type: integer description: >- - (Optional) The maximum number of tokens to generate + (Optional) The maximum number of tokens to generate. max_tokens: type: integer description: >- - (Optional) The maximum number of tokens to generate + (Optional) The maximum number of tokens to generate. n: type: integer description: >- - (Optional) The number of completions to generate + (Optional) The number of completions to generate. parallel_tool_calls: type: boolean description: >- - (Optional) Whether to parallelize tool calls + (Optional) Whether to parallelize tool calls. presence_penalty: type: number description: >- - (Optional) The penalty for repeated tokens + (Optional) The penalty for repeated tokens. response_format: $ref: '#/components/schemas/OpenAIResponseFormatParam' - description: (Optional) The response format to use + description: (Optional) The response format to use. seed: type: integer - description: (Optional) The seed to use + description: (Optional) The seed to use. stop: oneOf: - type: string - type: array items: type: string - description: (Optional) The stop tokens to use + description: (Optional) The stop tokens to use. stream: type: boolean description: >- - (Optional) Whether to stream the response + (Optional) Whether to stream the response. stream_options: type: object additionalProperties: @@ -6836,10 +7512,10 @@ components: - type: string - type: array - type: object - description: (Optional) The stream options to use + description: (Optional) The stream options to use. temperature: type: number - description: (Optional) The temperature to use + description: (Optional) The temperature to use. tool_choice: oneOf: - type: string @@ -6852,7 +7528,7 @@ components: - type: string - type: array - type: object - description: (Optional) The tool choice to use + description: (Optional) The tool choice to use. tools: type: array items: @@ -6865,17 +7541,17 @@ components: - type: string - type: array - type: object - description: (Optional) The tools to use + description: (Optional) The tools to use. top_logprobs: type: integer description: >- - (Optional) The top log probabilities to use + (Optional) The top log probabilities to use. top_p: type: number - description: (Optional) The top p to use + description: (Optional) The top p to use. user: type: string - description: (Optional) The user to use + description: (Optional) The user to use. 
additionalProperties: false required: - model @@ -6951,30 +7627,6 @@ components: title: OpenAIChatCompletionChunk description: >- Chunk from a streaming response to an OpenAI-compatible chat completion request. - OpenAIChoice: - type: object - properties: - message: - $ref: '#/components/schemas/OpenAIMessageParam' - description: The message from the model - finish_reason: - type: string - description: The reason the model stopped generating - index: - type: integer - description: The index of the choice - logprobs: - $ref: '#/components/schemas/OpenAIChoiceLogprobs' - description: >- - (Optional) The log probabilities for the tokens in the message - additionalProperties: false - required: - - message - - finish_reason - - index - title: OpenAIChoice - description: >- - A choice from an OpenAI-compatible chat completion response. OpenAIChoiceDelta: type: object properties: @@ -6996,26 +7648,6 @@ components: title: OpenAIChoiceDelta description: >- A delta from an OpenAI-compatible chat completion streaming response. - OpenAIChoiceLogprobs: - type: object - properties: - content: - type: array - items: - $ref: '#/components/schemas/OpenAITokenLogProb' - description: >- - (Optional) The log probabilities for the tokens in the message - refusal: - type: array - items: - $ref: '#/components/schemas/OpenAITokenLogProb' - description: >- - (Optional) The log probabilities for the tokens in the message - additionalProperties: false - title: OpenAIChoiceLogprobs - description: >- - The log probabilities for the tokens in the message from an OpenAI-compatible - chat completion response. OpenAIChunkChoice: type: object properties: @@ -7040,49 +7672,6 @@ components: title: OpenAIChunkChoice description: >- A chunk choice from an OpenAI-compatible chat completion streaming response. - OpenAITokenLogProb: - type: object - properties: - token: - type: string - bytes: - type: array - items: - type: integer - logprob: - type: number - top_logprobs: - type: array - items: - $ref: '#/components/schemas/OpenAITopLogProb' - additionalProperties: false - required: - - token - - logprob - - top_logprobs - title: OpenAITokenLogProb - description: >- - The log probability for a token from an OpenAI-compatible chat completion - response. - OpenAITopLogProb: - type: object - properties: - token: - type: string - bytes: - type: array - items: - type: integer - logprob: - type: number - additionalProperties: false - required: - - token - - logprob - title: OpenAITopLogProb - description: >- - The top log probability for a token from an OpenAI-compatible chat completion - response. OpenaiCompletionRequest: type: object properties: @@ -7105,52 +7694,52 @@ components: type: array items: type: integer - description: The prompt to generate a completion for + description: The prompt to generate a completion for. best_of: type: integer description: >- - (Optional) The number of completions to generate + (Optional) The number of completions to generate. echo: type: boolean - description: (Optional) Whether to echo the prompt + description: (Optional) Whether to echo the prompt. frequency_penalty: type: number description: >- - (Optional) The penalty for repeated tokens + (Optional) The penalty for repeated tokens. logit_bias: type: object additionalProperties: type: number - description: (Optional) The logit bias to use + description: (Optional) The logit bias to use. logprobs: type: boolean - description: (Optional) The log probabilities to use + description: (Optional) The log probabilities to use. 
max_tokens: type: integer description: >- - (Optional) The maximum number of tokens to generate + (Optional) The maximum number of tokens to generate. n: type: integer description: >- - (Optional) The number of completions to generate + (Optional) The number of completions to generate. presence_penalty: type: number description: >- - (Optional) The penalty for repeated tokens + (Optional) The penalty for repeated tokens. seed: type: integer - description: (Optional) The seed to use + description: (Optional) The seed to use. stop: oneOf: - type: string - type: array items: type: string - description: (Optional) The stop tokens to use + description: (Optional) The stop tokens to use. stream: type: boolean description: >- - (Optional) Whether to stream the response + (Optional) Whether to stream the response. stream_options: type: object additionalProperties: @@ -7161,16 +7750,16 @@ components: - type: string - type: array - type: object - description: (Optional) The stream options to use + description: (Optional) The stream options to use. temperature: type: number - description: (Optional) The temperature to use + description: (Optional) The temperature to use. top_p: type: number - description: (Optional) The top p to use + description: (Optional) The top p to use. user: type: string - description: (Optional) The user to use + description: (Optional) The user to use. guided_choice: type: array items: @@ -7386,12 +7975,16 @@ components: properties: job_uuid: type: string + description: The UUID of the job to create. finetuned_model: type: string + description: The model to fine-tune. algorithm_config: $ref: '#/components/schemas/DPOAlignmentConfig' + description: The algorithm configuration. training_config: $ref: '#/components/schemas/TrainingConfig' + description: The training configuration. hyperparam_search_config: type: object additionalProperties: @@ -7402,6 +7995,7 @@ components: - type: string - type: array - type: object + description: The hyperparam search configuration. logger_config: type: object additionalProperties: @@ -7412,6 +8006,7 @@ components: - type: string - type: array - type: object + description: The logger configuration. additionalProperties: false required: - job_uuid @@ -7467,18 +8062,37 @@ components: properties: query_generator_config: $ref: '#/components/schemas/RAGQueryGeneratorConfig' + description: Configuration for the query generator. max_tokens_in_context: type: integer default: 4096 + description: Maximum number of tokens in the context. max_chunks: type: integer default: 5 + description: Maximum number of chunks to retrieve. + chunk_template: + type: string + default: > + Result {index} + + Content: {chunk.content} + + Metadata: {metadata} + description: >- + Template for formatting each retrieved chunk in the context. Available + placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk + content string), {metadata} (chunk metadata dict). Default: "Result {index}\nContent: + {chunk.content}\nMetadata: {metadata}\n" additionalProperties: false required: - query_generator_config - max_tokens_in_context - max_chunks + - chunk_template title: RAGQueryConfig + description: >- + Configuration for the RAG query generation. RAGQueryGeneratorConfig: oneOf: - $ref: '#/components/schemas/DefaultRAGQueryGeneratorConfig' @@ -7528,8 +8142,11 @@ components: properties: vector_db_id: type: string + description: >- + The identifier of the vector database to query. 
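[Reviewer note] RAGQueryConfig now exposes a required chunk_template with {index}, {chunk.content}, and {metadata} placeholders. A sketch of a config that overrides the default template; the query generator shape shown here is assumed, since its full schema is outside this hunk.

# Mirrors RAGQueryConfig; chunk_template is now required alongside
# query_generator_config, max_tokens_in_context, and max_chunks.
rag_query_config = {
    "query_generator_config": {"type": "default", "separator": " "},  # assumed default generator fields
    "max_tokens_in_context": 4096,
    "max_chunks": 5,
    "chunk_template": (
        "Result {index}\n"
        "Content: {chunk.content}\n"
        "Metadata: {metadata}\n"
    ),
}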
query: $ref: '#/components/schemas/InterleavedContent' + description: The query to search for. params: type: object additionalProperties: @@ -7540,6 +8157,7 @@ components: - type: string - type: array - type: object + description: The parameters of the query. additionalProperties: false required: - vector_db_id @@ -7579,6 +8197,109 @@ components: - chunks - scores title: QueryChunksResponse + QueryMetricsRequest: + type: object + properties: + start_time: + type: integer + description: The start time of the metric to query. + end_time: + type: integer + description: The end time of the metric to query. + granularity: + type: string + description: The granularity of the metric to query. + query_type: + type: string + enum: + - range + - instant + description: The type of query to perform. + label_matchers: + type: array + items: + type: object + properties: + name: + type: string + value: + type: string + operator: + type: string + enum: + - '=' + - '!=' + - =~ + - '!~' + title: MetricLabelOperator + default: '=' + additionalProperties: false + required: + - name + - value + - operator + title: MetricLabelMatcher + description: >- + The label matchers to apply to the metric. + additionalProperties: false + required: + - start_time + - query_type + title: QueryMetricsRequest + MetricDataPoint: + type: object + properties: + timestamp: + type: integer + value: + type: number + additionalProperties: false + required: + - timestamp + - value + title: MetricDataPoint + MetricLabel: + type: object + properties: + name: + type: string + value: + type: string + additionalProperties: false + required: + - name + - value + title: MetricLabel + MetricSeries: + type: object + properties: + metric: + type: string + labels: + type: array + items: + $ref: '#/components/schemas/MetricLabel' + values: + type: array + items: + $ref: '#/components/schemas/MetricDataPoint' + additionalProperties: false + required: + - metric + - labels + - values + title: MetricSeries + QueryMetricsResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/MetricSeries' + additionalProperties: false + required: + - data + title: QueryMetricsResponse QueryCondition: type: object properties: @@ -7615,12 +8336,16 @@ components: type: array items: $ref: '#/components/schemas/QueryCondition' + description: >- + The attribute filters to apply to the spans. attributes_to_return: type: array items: type: string + description: The attributes to return in the spans. max_depth: type: integer + description: The maximum depth of the tree. additionalProperties: false required: - attribute_filters @@ -7644,14 +8369,19 @@ components: type: array items: $ref: '#/components/schemas/QueryCondition' + description: >- + The attribute filters to apply to the traces. limit: type: integer + description: The limit of traces to return. offset: type: integer + description: The offset of the traces to return. order_by: type: array items: type: string + description: The order by of the traces to return. additionalProperties: false title: QueryTracesRequest QueryTracesResponse: @@ -7670,16 +8400,25 @@ components: properties: benchmark_id: type: string + description: The ID of the benchmark to register. dataset_id: type: string + description: >- + The ID of the dataset to use for the benchmark. scoring_functions: type: array items: type: string + description: >- + The scoring functions to use for the benchmark. 
provider_benchmark_id: type: string + description: >- + The ID of the provider benchmark to use for the benchmark. provider_id: type: string + description: >- + The ID of the provider to use for the benchmark. metadata: type: object additionalProperties: @@ -7690,6 +8429,7 @@ components: - type: string - type: array - type: object + description: The metadata to use for the benchmark. additionalProperties: false required: - benchmark_id @@ -7706,7 +8446,7 @@ components: - eval/question-answer - eval/messages-answer description: >- - The purpose of the dataset. One of - "post-training/messages": The dataset + The purpose of the dataset. One of: - "post-training/messages": The dataset contains a messages column with list of messages for post-training. { "messages": [ {"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}, ] } - "eval/question-answer": The dataset @@ -7739,7 +8479,7 @@ components: - type: array - type: object description: >- - The metadata for the dataset. - E.g. {"description": "My dataset"} + The metadata for the dataset. - E.g. {"description": "My dataset"}. dataset_id: type: string description: >- @@ -7754,10 +8494,14 @@ components: properties: model_id: type: string + description: The identifier of the model to register. provider_model_id: type: string + description: >- + The identifier of the model in the provider. provider_id: type: string + description: The identifier of the provider. metadata: type: object additionalProperties: @@ -7768,8 +8512,10 @@ components: - type: string - type: array - type: object + description: Any additional metadata for this model. model_type: $ref: '#/components/schemas/ModelType' + description: The type of model to register. additionalProperties: false required: - model_id @@ -7779,16 +8525,27 @@ components: properties: scoring_fn_id: type: string + description: >- + The ID of the scoring function to register. description: type: string + description: The description of the scoring function. return_type: $ref: '#/components/schemas/ParamType' + description: The return type of the scoring function. provider_scoring_fn_id: type: string + description: >- + The ID of the provider scoring function to use for the scoring function. provider_id: type: string + description: >- + The ID of the provider to use for the scoring function. params: $ref: '#/components/schemas/ScoringFnParams' + description: >- + The parameters for the scoring function for benchmark eval, these can + be overridden for app eval. additionalProperties: false required: - scoring_fn_id @@ -7800,10 +8557,15 @@ components: properties: shield_id: type: string + description: >- + The identifier of the shield to register. provider_shield_id: type: string + description: >- + The identifier of the shield in the provider. provider_id: type: string + description: The identifier of the provider. params: type: object additionalProperties: @@ -7814,6 +8576,7 @@ components: - type: string - type: array - type: object + description: The parameters of the shield. additionalProperties: false required: - shield_id @@ -7823,10 +8586,15 @@ components: properties: toolgroup_id: type: string + description: The ID of the tool group to register. provider_id: type: string + description: >- + The ID of the provider to use for the tool group. mcp_endpoint: $ref: '#/components/schemas/URL' + description: >- + The MCP endpoint to use for the tool group. 
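[Reviewer note] The register-model request body now carries field-level documentation. A sketch of registering a model, assuming a local server at http://localhost:8321 and a POST /v1/models route (the route itself is outside this hunk); only model_id is required, everything else below is an assumed example value.

import requests

BASE_URL = "http://localhost:8321"  # assumed local Llama Stack server

# Mirrors RegisterModelRequest: only model_id is required.
payload = {
    "model_id": "my-llama",                                    # identifier to register
    "provider_model_id": "meta-llama/Llama-3.2-3B-Instruct",   # assumed provider-side id
    "provider_id": "ollama",                                   # assumed provider
    "metadata": {"description": "local dev model"},
    "model_type": "llm",                                       # assumed ModelType value
}

# The register route is assumed here, not shown in this hunk.
resp = requests.post(f"{BASE_URL}/v1/models", json=payload)
resp.raise_for_status()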
args: type: object additionalProperties: @@ -7837,6 +8605,8 @@ components: - type: string - type: array - type: object + description: >- + A dictionary of arguments to pass to the tool group. additionalProperties: false required: - toolgroup_id @@ -7847,14 +8617,21 @@ components: properties: vector_db_id: type: string + description: >- + The identifier of the vector database to register. embedding_model: type: string + description: The embedding model to use. embedding_dimension: type: integer + description: The dimension of the embedding model. provider_id: type: string + description: The identifier of the provider. provider_vector_db_id: type: string + description: >- + The identifier of the vector database in the provider. additionalProperties: false required: - vector_db_id @@ -7891,10 +8668,12 @@ components: properties: shield_id: type: string + description: The identifier of the shield to run. messages: type: array items: $ref: '#/components/schemas/Message' + description: The messages to run the shield on. params: type: object additionalProperties: @@ -7905,6 +8684,7 @@ components: - type: string - type: array - type: object + description: The parameters of the shield. additionalProperties: false required: - shield_id @@ -7925,14 +8705,20 @@ components: type: array items: $ref: '#/components/schemas/QueryCondition' + description: >- + The attribute filters to apply to the spans. attributes_to_save: type: array items: type: string + description: The attributes to save to the dataset. dataset_id: type: string + description: >- + The ID of the dataset to save the spans to. max_depth: type: integer + description: The maximum depth of the tree. additionalProperties: false required: - attribute_filters @@ -7987,14 +8773,19 @@ components: properties: dataset_id: type: string + description: The ID of the dataset to score. scoring_functions: type: object additionalProperties: oneOf: - $ref: '#/components/schemas/ScoringFnParams' - type: 'null' + description: >- + The scoring functions to use for the scoring. save_results_dataset: type: boolean + description: >- + Whether to save the results to a dataset. additionalProperties: false required: - dataset_id @@ -8079,8 +8870,10 @@ components: properties: job_uuid: type: string + description: The UUID of the job to create. training_config: $ref: '#/components/schemas/TrainingConfig' + description: The training configuration. hyperparam_search_config: type: object additionalProperties: @@ -8091,6 +8884,7 @@ components: - type: string - type: array - type: object + description: The hyperparam search configuration. logger_config: type: object additionalProperties: @@ -8101,12 +8895,16 @@ components: - type: string - type: array - type: object + description: The logger configuration. model: type: string + description: The model to fine-tune. checkpoint_dir: type: string + description: The directory to save checkpoint(s) to. algorithm_config: $ref: '#/components/schemas/AlgorithmConfig' + description: The algorithm configuration. additionalProperties: false required: - job_uuid diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index b764d4d34..cdaf074b8 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -1050,8 +1050,6 @@ "text/html": [ "
ToolGroup(\n",
               "identifier='builtin::code_interpreter',\n",
-              "provider_id='code-interpreter',\n",
-              "provider_resource_id='builtin::code_interpreter',\n",
               "type='tool_group',\n",
               "args=None,\n",
               "mcp_endpoint=None\n",
@@ -1061,7 +1059,6 @@
             "text/plain": [
               "\u001b[1;35mToolGroup\u001b[0m\u001b[1m(\u001b[0m\n",
               "\u001b[2;32m│   \u001b[0m\u001b[33midentifier\u001b[0m=\u001b[32m'builtin::code_interpreter'\u001b[0m,\n",
-              "\u001b[2;32m│   \u001b[0m\u001b[33mprovider_id\u001b[0m=\u001b[32m'code-interpreter'\u001b[0m,\n",
               "\u001b[2;32m│   \u001b[0m\u001b[33mprovider_resource_id\u001b[0m=\u001b[32m'builtin::code_interpreter'\u001b[0m,\n",
               "\u001b[2;32m│   \u001b[0m\u001b[33mtype\u001b[0m=\u001b[32m'tool_group'\u001b[0m,\n",
               "\u001b[2;32m│   \u001b[0m\u001b[33margs\u001b[0m=\u001b[3;35mNone\u001b[0m,\n",
diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
index 5de7f715e..413b693d1 100644
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
@@ -337,9 +337,6 @@
               "    provider_id: tavily-search\n",
               "    provider_type: remote::tavily-search\n",
               "  - config: {}\n",
-              "    provider_id: code-interpreter\n",
-              "    provider_type: inline::code-interpreter\n",
-              "  - config: {}\n",
               "    provider_id: rag-runtime\n",
               "    provider_type: inline::rag-runtime\n",
               "  - config: {}\n",
@@ -378,10 +375,6 @@
               "  toolgroup_id: builtin::rag\n",
               "- args: null\n",
               "  mcp_endpoint: null\n",
-              "  provider_id: code-interpreter\n",
-              "  toolgroup_id: builtin::code_interpreter\n",
-              "- args: null\n",
-              "  mcp_endpoint: null\n",
               "  provider_id: wolfram-alpha\n",
               "  toolgroup_id: builtin::wolfram_alpha\n",
               "vector_dbs: []\n",
@@ -617,9 +610,6 @@
               "    provider_id: tavily-search\n",
               "    provider_type: remote::tavily-search\n",
               "  - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
-              "    provider_id: code-interpreter\n",
-              "    provider_type: inlin\u001b[1;92me::c\u001b[0mode-interpreter\n",
-              "  - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
               "    provider_id: rag-runtime\n",
               "    provider_type: inline::rag-runtime\n",
               "  - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
@@ -658,10 +648,6 @@
               "  toolgroup_id: builtin::rag\n",
               "- args: null\n",
               "  mcp_endpoint: null\n",
-              "  provider_id: code-interpreter\n",
-              "  toolgroup_id: builtin::code_interpreter\n",
-              "- args: null\n",
-              "  mcp_endpoint: null\n",
               "  provider_id: wolfram-alpha\n",
               "  toolgroup_id: builtin::wolfram_alpha\n",
               "vector_dbs: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
diff --git a/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb b/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
index 399a3bff1..e70cc3bbe 100644
--- a/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
+++ b/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
@@ -840,7 +840,6 @@
     "    \"memory_optimizations.rst\",\n",
     "    \"chat.rst\",\n",
     "    \"llama3.rst\",\n",
-    "    \"datasets.rst\",\n",
     "    \"qat_finetune.rst\",\n",
     "    \"lora_finetune.rst\",\n",
     "]\n",
@@ -1586,7 +1585,6 @@
     "    \"memory_optimizations.rst\",\n",
     "    \"chat.rst\",\n",
     "    \"llama3.rst\",\n",
-    "    \"datasets.rst\",\n",
     "    \"qat_finetune.rst\",\n",
     "    \"lora_finetune.rst\",\n",
     "]\n",
diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py
index caa4f17ff..9fc375175 100644
--- a/docs/openapi_generator/generate.py
+++ b/docs/openapi_generator/generate.py
@@ -44,7 +44,7 @@ def main(output_dir: str):
     if return_type_errors:
         print("\nAPI Method Return Type Validation Errors:\n")
         for error in return_type_errors:
-            print(error)
+            print(error, file=sys.stderr)
         sys.exit(1)
     now = str(datetime.now())
     print(
diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py
index cc594d8d7..5b7a685c1 100644
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@@ -759,7 +759,7 @@ class Generator:
         )
 
         return Operation(
-            tags=[op.defining_class.__name__],
+            tags=[getattr(op.defining_class, "API_NAMESPACE", op.defining_class.__name__)],
             summary=None,
             # summary=doc_string.short_description,
             description=description,
@@ -805,6 +805,8 @@ class Generator:
         operation_tags: List[Tag] = []
         for cls in endpoint_classes:
             doc_string = parse_type(cls)
+            if hasattr(cls, "API_NAMESPACE") and cls.API_NAMESPACE != cls.__name__:
+                continue
             operation_tags.append(
                 Tag(
                     name=cls.__name__,
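Note on the hunk above: the generator now tags each operation with the defining class's `API_NAMESPACE` when one is set, and skips emitting a separate `Tag` for such classes. A minimal sketch of how a protocol might opt in — the `Safety`/`SafetyAdmin` classes here are hypothetical, not part of this change:

```python
from typing import Protocol


class Safety(Protocol):
    """Primary API protocol; keeps its own tag."""


class SafetyAdmin(Protocol):
    """Supplemental protocol whose operations should appear under the Safety tag."""

    API_NAMESPACE = "Safety"


for cls in (Safety, SafetyAdmin):
    # Mirrors the generator logic: use API_NAMESPACE for the operation tag,
    # and only emit a Tag entry for classes that are their own namespace.
    tag = getattr(cls, "API_NAMESPACE", cls.__name__)
    emits_own_tag = not (hasattr(cls, "API_NAMESPACE") and cls.API_NAMESPACE != cls.__name__)
    print(f"{cls.__name__}: operations tagged '{tag}', emits its own Tag entry: {emits_own_tag}")
```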
diff --git a/docs/openapi_generator/pyopenapi/utility.py b/docs/openapi_generator/pyopenapi/utility.py
index db18e8430..12a69050c 100644
--- a/docs/openapi_generator/pyopenapi/utility.py
+++ b/docs/openapi_generator/pyopenapi/utility.py
@@ -174,14 +174,64 @@ def _validate_list_parameters_contain_data(method) -> str | None:
         return "does not have a mandatory data attribute containing the list of objects"
 
 
+def _validate_has_ellipsis(method) -> str | None:
+    source = inspect.getsource(method)
+    if "..." not in source and not "NotImplementedError" in source:
+        return "does not contain ellipsis (...) in its implementation"
+
+def _validate_has_return_in_docstring(method) -> str | None:
+    source = inspect.getsource(method)
+    return_type = method.__annotations__.get('return')
+    if return_type is not None and return_type != type(None) and ":returns:" not in source:
+        return "does not have a ':returns:' in its docstring"
+
+def _validate_has_params_in_docstring(method) -> str | None:
+    source = inspect.getsource(method)
+    sig = inspect.signature(method)
+    # Only check if the method has more than one parameter
+    if len(sig.parameters) > 1 and ":param" not in source:
+        return "does not have a ':param' in its docstring"
+
+def _validate_has_no_return_none_in_docstring(method) -> str | None:
+    source = inspect.getsource(method)
+    return_type = method.__annotations__.get('return')
+    if return_type is None and ":returns: None" in source:
+        return "has a ':returns: None' in its docstring which is redundant for None-returning functions"
+
+def _validate_docstring_lines_end_with_dot(method) -> str | None:
+    docstring = inspect.getdoc(method)
+    if docstring is None:
+        return None
+
+    lines = docstring.split('\n')
+    for line in lines:
+        line = line.strip()
+        if line and not any(line.endswith(char) for char in '.:{}[]()",'):
+            return f"docstring line '{line}' does not end with a valid character: . : {{ }} [ ] ( ) , \""
+
 _VALIDATORS = {
     "GET": [
         _validate_api_method_return_type,
         _validate_list_parameters_contain_data,
         _validate_api_method_doesnt_return_list,
+        _validate_has_ellipsis,
+        _validate_has_return_in_docstring,
+        _validate_has_params_in_docstring,
+        _validate_docstring_lines_end_with_dot,
     ],
     "DELETE": [
         _validate_api_delete_method_returns_none,
+        _validate_has_ellipsis,
+        _validate_has_return_in_docstring,
+        _validate_has_params_in_docstring,
+        _validate_has_no_return_none_in_docstring
+    ],
+    "POST": [
+        _validate_has_ellipsis,
+        _validate_has_return_in_docstring,
+        _validate_has_params_in_docstring,
+        _validate_has_no_return_none_in_docstring,
+        _validate_docstring_lines_end_with_dot,
     ],
 }
 
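The new validators above enforce docstring conventions on API protocol methods: an ellipsis body, `:param` and `:returns:` entries where applicable, no redundant `:returns: None`, and docstring lines ending with an accepted character. A hedged sketch of a method written to satisfy those checks (the `Widgets` protocol and `get_widget` method are illustrative only):

```python
from typing import Protocol


class Widgets(Protocol):
    async def get_widget(self, widget_id: str) -> dict:
        """Get a widget by its ID.

        :param widget_id: The ID of the widget to get.
        :returns: A dictionary describing the widget.
        """
        ...
```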
diff --git a/docs/source/building_applications/rag.md b/docs/source/building_applications/rag.md
index db6303209..dbe90a7fc 100644
--- a/docs/source/building_applications/rag.md
+++ b/docs/source/building_applications/rag.md
@@ -51,6 +51,7 @@ chunks = [
         "mime_type": "text/plain",
         "metadata": {
             "document_id": "doc1",
+            "author": "Jane Doe",
         },
     },
 ]
@@ -98,6 +99,17 @@ results = client.tool_runtime.rag_tool.query(
 )
 ```
 
+If it is useful for your application, you can configure how the RAG tool adds metadata to the retrieved context. Simply add:
+```python
+# Query documents
+results = client.tool_runtime.rag_tool.query(
+    vector_db_ids=[vector_db_id],
+    content="What do you know about...",
+    query_config={
+        "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
+    },
+)
+```
 ### Building RAG-Enhanced Agents
 
 One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
@@ -115,6 +127,12 @@ agent = Agent(
             "name": "builtin::rag/knowledge_search",
             "args": {
                 "vector_db_ids": [vector_db_id],
+                # Defaults
+                "query_config": {
+                    "chunk_size_in_tokens": 512,
+                    "chunk_overlap_in_tokens": 0,
+                    "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
+                },
             },
         }
     ],
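For reference, the `chunk_template` above substitutes `{index}`, `{chunk.content}`, and `{metadata}` for each retrieved chunk. A rough illustration of the rendered output, assuming `str.format`-style substitution (the chunk text and metadata are made up):

```python
from types import SimpleNamespace

# Default template from the query_config above
chunk_template = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n"

# Stand-ins for a retrieved chunk and its metadata
chunk = SimpleNamespace(content="Llama 3 supports a 128K context window.")
metadata = {"document_id": "doc1", "author": "Jane Doe"}

print(chunk_template.format(index=1, chunk=chunk, metadata=metadata))
```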
diff --git a/docs/source/building_applications/tools.md b/docs/source/building_applications/tools.md
index 95c69ffa3..c7af17bfa 100644
--- a/docs/source/building_applications/tools.md
+++ b/docs/source/building_applications/tools.md
@@ -165,34 +165,6 @@ all_tools = client.tools.list_tools()
 group_tools = client.tools.list_tools(toolgroup_id="search_tools")
 ```
 
-## Simple Example: Using an Agent with the Code-Interpreter Tool
-
-```python
-from llama_stack_client import Agent
-
-# Instantiate the AI agent with the given configuration
-agent = Agent(
-    client,
-    name="code-interpreter",
-    description="A code interpreter agent for executing Python code snippets",
-    instructions="""
-    You are a highly reliable, concise, and precise assistant.
-    Always show the generated code, never generate your own code, and never anticipate results.
-    """,
-    model="meta-llama/Llama-3.2-3B-Instruct",
-    tools=["builtin::code_interpreter"],
-    max_infer_iters=5,
-)
-
-# Start a session
-session_id = agent.create_session("tool_session")
-
-# Send a query to the AI agent for code execution
-response = agent.create_turn(
-    messages=[{"role": "user", "content": "Run this code: print(3 ** 4 - 5 * 2)"}],
-    session_id=session_id,
-)
-```
 ## Simple Example 2: Using an Agent with the Web Search Tool
 1. Start by registering a Tavily API key at [Tavily](https://tavily.com/).
 2. [Optional] Provide the API key directly to the Llama Stack server
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 55c6383b2..501a923dd 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -110,6 +110,8 @@ html_theme_options = {
     "canonical_url": "https://github.com/meta-llama/llama-stack",
     "collapse_navigation": False,
     # "style_nav_header_background": "#c3c9d4",
+    "display_version": True,
+    "version_selector": True,
 }
 
 default_dark_mode = False
diff --git a/docs/source/contributing/new_api_provider.md b/docs/source/contributing/new_api_provider.md
index c412a350b..83058896a 100644
--- a/docs/source/contributing/new_api_provider.md
+++ b/docs/source/contributing/new_api_provider.md
@@ -6,7 +6,7 @@ This guide will walk you through the process of adding a new API provider to Lla
 - Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
 - Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
 - Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
-- Update any distribution {repopath}`Templates::llama_stack/templates/` build.yaml and run.yaml files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
+- Update any distribution {repopath}`Templates::llama_stack/templates/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
 
 
 Here are some example PRs to help you get started:
@@ -33,6 +33,7 @@ Note that each provider's `sample_run_config()` method (in the configuration cla
 
 Unit tests are located in {repopath}`tests/unit`. Provider-specific unit tests are located in {repopath}`tests/unit/providers`. These tests are all run automatically as part of the CI process.
 
+Consult {repopath}`tests/unit/README.md` for more details on how to run the tests manually.
 
 ### 3. Additional end-to-end testing
 
diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md
index 56b8d30a8..d9b73c910 100644
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@@ -178,7 +178,7 @@ image_name: ollama
 image_type: conda
 
 # If some providers are external, you can specify the path to the implementation
-external_providers_dir: /etc/llama-stack/providers.d
+external_providers_dir: ~/.llama/providers.d
 ```
 
 ```
@@ -206,7 +206,7 @@ distribution_spec:
 image_type: container
 image_name: ci-test
 # Path to external provider implementations
-external_providers_dir: /etc/llama-stack/providers.d
+external_providers_dir: ~/.llama/providers.d
 ```
 
 Here's an example for a custom Ollama provider:
@@ -271,7 +271,7 @@ Now, let's start the Llama Stack Distribution Server. You will need the YAML con
 
 ```
 llama stack run -h
-usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE]
+usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE]
                        [--image-type {conda,container,venv}]
                        config
 
@@ -285,7 +285,6 @@ options:
   --port PORT           Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
   --image-name IMAGE_NAME
                         Name of the image to run. Defaults to the current environment (default: None)
-  --disable-ipv6        Disable IPv6 support (default: False)
   --env KEY=VALUE       Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: [])
   --tls-keyfile TLS_KEYFILE
                         Path to TLS key file for HTTPS (default: None)
diff --git a/docs/source/distributions/kubernetes_deployment.md b/docs/source/distributions/kubernetes_deployment.md
index 21ec02012..f43039824 100644
--- a/docs/source/distributions/kubernetes_deployment.md
+++ b/docs/source/distributions/kubernetes_deployment.md
@@ -172,7 +172,7 @@ spec:
       - name: llama-stack
         image: localhost/llama-stack-run-k8s:latest
         imagePullPolicy: IfNotPresent
-        command: ["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"]
+        command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/app/config.yaml"]
         ports:
           - containerPort: 5000
         volumeMounts:
diff --git a/docs/source/distributions/remote_hosted_distro/watsonx.md b/docs/source/distributions/remote_hosted_distro/watsonx.md
index b7c89e9b0..ec1b98059 100644
--- a/docs/source/distributions/remote_hosted_distro/watsonx.md
+++ b/docs/source/distributions/remote_hosted_distro/watsonx.md
@@ -18,7 +18,7 @@ The `llamastack/distribution-watsonx` distribution consists of the following pro
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
-| inference | `remote::watsonx` |
+| inference | `remote::watsonx`, `inline::sentence-transformers` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
@@ -70,7 +70,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-watsonx \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env WATSONX_API_KEY=$WATSONX_API_KEY \
   --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
diff --git a/docs/source/distributions/self_hosted_distro/cerebras.md b/docs/source/distributions/self_hosted_distro/cerebras.md
index 329c9b802..3c4db1b75 100644
--- a/docs/source/distributions/self_hosted_distro/cerebras.md
+++ b/docs/source/distributions/self_hosted_distro/cerebras.md
@@ -52,7 +52,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-cerebras \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY
 ```
diff --git a/docs/source/distributions/self_hosted_distro/dell.md b/docs/source/distributions/self_hosted_distro/dell.md
index 96b0ef478..eded3bdc4 100644
--- a/docs/source/distributions/self_hosted_distro/dell.md
+++ b/docs/source/distributions/self_hosted_distro/dell.md
@@ -23,7 +23,7 @@ The `llamastack/distribution-dell` distribution consists of the following provid
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
@@ -155,7 +155,7 @@ docker run \
   -v $HOME/.llama:/root/.llama \
   -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-dell \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env DEH_URL=$DEH_URL \
diff --git a/docs/source/distributions/self_hosted_distro/nvidia.md b/docs/source/distributions/self_hosted_distro/nvidia.md
index ef3ec8213..920ce950f 100644
--- a/docs/source/distributions/self_hosted_distro/nvidia.md
+++ b/docs/source/distributions/self_hosted_distro/nvidia.md
@@ -144,7 +144,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-nvidia \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env NVIDIA_API_KEY=$NVIDIA_API_KEY
 ```
diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md
index 5d8935fe2..4d148feda 100644
--- a/docs/source/distributions/self_hosted_distro/ollama.md
+++ b/docs/source/distributions/self_hosted_distro/ollama.md
@@ -19,6 +19,7 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::ollama` |
+| post_training | `inline::huggingface` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
@@ -97,7 +98,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/ollama/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-ollama \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env SAFETY_MODEL=$SAFETY_MODEL \
diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md
index 2ff4bad5b..6e7cf410d 100644
--- a/docs/source/distributions/self_hosted_distro/remote-vllm.md
+++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md
@@ -233,7 +233,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \
   llamastack/distribution-remote-vllm \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1
@@ -255,7 +255,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/remote-vllm/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-remote-vllm \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1 \
diff --git a/docs/source/distributions/self_hosted_distro/sambanova.md b/docs/source/distributions/self_hosted_distro/sambanova.md
index 873c3075c..aaa8fd3cc 100644
--- a/docs/source/distributions/self_hosted_distro/sambanova.md
+++ b/docs/source/distributions/self_hosted_distro/sambanova.md
@@ -16,10 +16,10 @@ The `llamastack/distribution-sambanova` distribution consists of the following p
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
-| inference | `remote::sambanova` |
+| inference | `remote::sambanova`, `inline::sentence-transformers` |
 | safety | `inline::llama-guard` |
 | telemetry | `inline::meta-reference` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
@@ -28,22 +28,22 @@ The `llamastack/distribution-sambanova` distribution consists of the following p
 The following environment variables can be configured:
 
 - `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
-- `SAMBANOVA_API_KEY`: SambaNova.AI API Key (default: ``)
+- `SAMBANOVA_API_KEY`: SambaNova API Key (default: ``)
 
 ### Models
 
 The following models are available by default:
 
-- `Meta-Llama-3.1-8B-Instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
-- `Meta-Llama-3.1-70B-Instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
-- `Meta-Llama-3.1-405B-Instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
-- `Meta-Llama-3.2-1B-Instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
-- `Meta-Llama-3.2-3B-Instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
-- `Meta-Llama-3.3-70B-Instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
-- `Llama-3.2-11B-Vision-Instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
-- `Llama-3.2-90B-Vision-Instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
-- `Meta-Llama-Guard-3-8B (aliases: meta-llama/Llama-Guard-3-8B)`
-- `Llama-4-Scout-17B-16E-Instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
+- `sambanova/Meta-Llama-3.1-8B-Instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
+- `sambanova/Meta-Llama-3.1-405B-Instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
+- `sambanova/Meta-Llama-3.2-1B-Instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
+- `sambanova/Meta-Llama-3.2-3B-Instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
+- `sambanova/Meta-Llama-3.3-70B-Instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
+- `sambanova/Llama-3.2-11B-Vision-Instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
+- `sambanova/Llama-3.2-90B-Vision-Instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
+- `sambanova/Llama-4-Scout-17B-16E-Instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)`
+- `sambanova/Llama-4-Maverick-17B-128E-Instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)`
+- `sambanova/Meta-Llama-Guard-3-8B (aliases: meta-llama/Llama-Guard-3-8B)`
 
 
 ### Prerequisite: API Keys
diff --git a/docs/source/distributions/self_hosted_distro/tgi.md b/docs/source/distributions/self_hosted_distro/tgi.md
index 7a75aa559..24f9d03ec 100644
--- a/docs/source/distributions/self_hosted_distro/tgi.md
+++ b/docs/source/distributions/self_hosted_distro/tgi.md
@@ -117,7 +117,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-tgi \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \
diff --git a/docs/source/getting_started/detailed_tutorial.md b/docs/source/getting_started/detailed_tutorial.md
index a1504f249..e40a4903a 100644
--- a/docs/source/getting_started/detailed_tutorial.md
+++ b/docs/source/getting_started/detailed_tutorial.md
@@ -42,7 +42,7 @@ powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | ie
 Setup your virtual environment.
 
 ```bash
-uv venv --python 3.10
+uv sync --python 3.10
 source .venv/bin/activate
 ```
 ## Step 2:  Run Llama Stack
@@ -445,7 +445,6 @@ from llama_stack_client import LlamaStackClient
 from llama_stack_client import Agent, AgentEventLogger
 from llama_stack_client.types import Document
 import uuid
-from termcolor import cprint
 
 client = LlamaStackClient(base_url="http://localhost:8321")
 
@@ -463,7 +462,6 @@ urls = [
     "memory_optimizations.rst",
     "chat.rst",
     "llama3.rst",
-    "datasets.rst",
     "qat_finetune.rst",
     "lora_finetune.rst",
 ]
diff --git a/docs/source/providers/external.md b/docs/source/providers/external.md
index ee36ebc3c..55211ac5f 100644
--- a/docs/source/providers/external.md
+++ b/docs/source/providers/external.md
@@ -10,7 +10,7 @@ Llama Stack supports external providers that live outside of the main codebase.
 To enable external providers, you need to configure the `external_providers_dir` in your Llama Stack configuration. This directory should contain your external provider specifications:
 
 ```yaml
-external_providers_dir: /etc/llama-stack/providers.d/
+external_providers_dir: ~/.llama/providers.d/
 ```
 
 ## Directory Structure
@@ -53,7 +53,7 @@ Here's a list of known external providers that you can use with Llama Stack:
 | Name | Description | API | Type | Repository |
 |------|-------------|-----|------|------------|
 | KubeFlow Training | Train models with KubeFlow | Post Training | Remote | [llama-stack-provider-kft](https://github.com/opendatahub-io/llama-stack-provider-kft) |
-| KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
+| KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Inline **and** Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
 | RamaLama | Inference models with RamaLama | Inference | Remote | [ramalama-stack](https://github.com/containers/ramalama-stack) |
 | TrustyAI LM-Eval | Evaluate models with TrustyAI LM-Eval | Eval | Remote | [llama-stack-provider-lmeval](https://github.com/trustyai-explainability/llama-stack-provider-lmeval) |
 
@@ -182,7 +182,7 @@ dependencies = ["llama-stack", "pydantic", "ollama", "aiohttp"]
 3. Create the provider specification:
 
 ```yaml
-# /etc/llama-stack/providers.d/remote/inference/custom_ollama.yaml
+# ~/.llama/providers.d/remote/inference/custom_ollama.yaml
 adapter:
   adapter_type: custom_ollama
   pip_packages: ["ollama", "aiohttp"]
@@ -201,7 +201,7 @@ uv pip install -e .
 5. Configure Llama Stack to use external providers:
 
 ```yaml
-external_providers_dir: /etc/llama-stack/providers.d/
+external_providers_dir: ~/.llama/providers.d/
 ```
 
 The provider will now be available in Llama Stack with the type `remote::custom_ollama`.
diff --git a/docs/source/references/llama_stack_client_cli_reference.md b/docs/source/references/llama_stack_client_cli_reference.md
index 0b84027f0..cd4dd4cd7 100644
--- a/docs/source/references/llama_stack_client_cli_reference.md
+++ b/docs/source/references/llama_stack_client_cli_reference.md
@@ -253,8 +253,6 @@ llama-stack-client toolgroups list
 +---------------------------+------------------+------+---------------+
 | identifier                | provider_id      | args | mcp_endpoint  |
 +===========================+==================+======+===============+
-| builtin::code_interpreter | code-interpreter | None | None          |
-+---------------------------+------------------+------+---------------+
 | builtin::rag              | rag-runtime      | None | None          |
 +---------------------------+------------------+------+---------------+
 | builtin::websearch        | tavily-search    | None | None          |
diff --git a/install.sh b/install.sh
index 614dbc2f2..e424925a6 100755
--- a/install.sh
+++ b/install.sh
@@ -38,6 +38,67 @@ wait_for_service() {
   return 0
 }
 
+usage() {
+    cat << EOF
+📚 Llama-Stack Deployment Script
+
+Description:
+    This script sets up and deploys Llama-Stack with Ollama integration in containers.
+    It handles both Docker and Podman runtimes and includes automatic platform detection.
+
+Usage:
+    $(basename "$0") [OPTIONS]
+
+Options:
+    -p, --port PORT            Server port for Llama-Stack (default: ${PORT})
+    -o, --ollama-port PORT     Ollama service port (default: ${OLLAMA_PORT})
+    -m, --model MODEL          Model alias to use (default: ${MODEL_ALIAS})
+    -i, --image IMAGE          Server image (default: ${SERVER_IMAGE})
+    -t, --timeout SECONDS      Service wait timeout in seconds (default: ${WAIT_TIMEOUT})
+    -h, --help                 Show this help message
+
+For more information:
+    Documentation: https://llama-stack.readthedocs.io/
+    GitHub: https://github.com/meta-llama/llama-stack
+
+Report issues:
+    https://github.com/meta-llama/llama-stack/issues
+EOF
+}
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        -p|--port)
+            PORT="$2"
+            shift 2
+            ;;
+        -o|--ollama-port)
+            OLLAMA_PORT="$2"
+            shift 2
+            ;;
+        -m|--model)
+            MODEL_ALIAS="$2"
+            shift 2
+            ;;
+        -i|--image)
+            SERVER_IMAGE="$2"
+            shift 2
+            ;;
+        -t|--timeout)
+            WAIT_TIMEOUT="$2"
+            shift 2
+            ;;
+        *)
+            die "Unknown option: $1"
+            ;;
+    esac
+done
+
 if command -v docker &> /dev/null; then
   ENGINE="docker"
 elif command -v podman &> /dev/null; then
diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py
index 30b37e98f..b2f85336c 100644
--- a/llama_stack/apis/agents/agents.py
+++ b/llama_stack/apis/agents/agents.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import sys
 from collections.abc import AsyncIterator
 from datetime import datetime
 from enum import Enum
@@ -12,6 +13,7 @@ from typing import Annotated, Any, Literal, Protocol, runtime_checkable
 from pydantic import BaseModel, ConfigDict, Field
 
 from llama_stack.apis.common.content_types import URL, ContentDelta, InterleavedContent
+from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.apis.inference import (
     CompletionMessage,
     ResponseFormat,
@@ -29,12 +31,20 @@ from llama_stack.apis.tools import ToolDef
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 
 from .openai_responses import (
-    OpenAIResponseInputMessage,
+    OpenAIResponseInput,
     OpenAIResponseInputTool,
     OpenAIResponseObject,
     OpenAIResponseObjectStream,
 )
 
+# TODO: use enum.StrEnum when we drop support for python 3.10
+if sys.version_info >= (3, 11):
+    from enum import StrEnum
+else:
+
+    class StrEnum(str, Enum):
+        """Backport of StrEnum for Python 3.10 and below."""
+
 
 class Attachment(BaseModel):
     """An attachment to an agent turn.
@@ -73,7 +83,7 @@ class StepCommon(BaseModel):
     completed_at: datetime | None = None
 
 
-class StepType(Enum):
+class StepType(StrEnum):
     """Type of the step in an agent turn.
 
     :cvar inference: The step is an inference step that calls an LLM.
@@ -97,7 +107,7 @@ class InferenceStep(StepCommon):
 
     model_config = ConfigDict(protected_namespaces=())
 
-    step_type: Literal[StepType.inference.value] = StepType.inference.value
+    step_type: Literal[StepType.inference] = StepType.inference
     model_response: CompletionMessage
 
 
@@ -109,7 +119,7 @@ class ToolExecutionStep(StepCommon):
     :param tool_responses: The tool responses from the tool calls.
     """
 
-    step_type: Literal[StepType.tool_execution.value] = StepType.tool_execution.value
+    step_type: Literal[StepType.tool_execution] = StepType.tool_execution
     tool_calls: list[ToolCall]
     tool_responses: list[ToolResponse]
 
@@ -121,7 +131,7 @@ class ShieldCallStep(StepCommon):
     :param violation: The violation from the shield call.
     """
 
-    step_type: Literal[StepType.shield_call.value] = StepType.shield_call.value
+    step_type: Literal[StepType.shield_call] = StepType.shield_call
     violation: SafetyViolation | None
 
 
@@ -133,7 +143,7 @@ class MemoryRetrievalStep(StepCommon):
     :param inserted_context: The context retrieved from the vector databases.
     """
 
-    step_type: Literal[StepType.memory_retrieval.value] = StepType.memory_retrieval.value
+    step_type: Literal[StepType.memory_retrieval] = StepType.memory_retrieval
     # TODO: should this be List[str]?
     vector_db_ids: str
     inserted_context: InterleavedContent
@@ -154,7 +164,7 @@ class Turn(BaseModel):
     input_messages: list[UserMessage | ToolResponseMessage]
     steps: list[Step]
     output_message: CompletionMessage
-    output_attachments: list[Attachment] | None = Field(default_factory=list)
+    output_attachments: list[Attachment] | None = Field(default_factory=lambda: [])
 
     started_at: datetime
     completed_at: datetime | None = None
@@ -182,10 +192,10 @@ register_schema(AgentToolGroup, name="AgentTool")
 class AgentConfigCommon(BaseModel):
     sampling_params: SamplingParams | None = Field(default_factory=SamplingParams)
 
-    input_shields: list[str] | None = Field(default_factory=list)
-    output_shields: list[str] | None = Field(default_factory=list)
-    toolgroups: list[AgentToolGroup] | None = Field(default_factory=list)
-    client_tools: list[ToolDef] | None = Field(default_factory=list)
+    input_shields: list[str] | None = Field(default_factory=lambda: [])
+    output_shields: list[str] | None = Field(default_factory=lambda: [])
+    toolgroups: list[AgentToolGroup] | None = Field(default_factory=lambda: [])
+    client_tools: list[ToolDef] | None = Field(default_factory=lambda: [])
     tool_choice: ToolChoice | None = Field(default=None, deprecated="use tool_config instead")
     tool_prompt_format: ToolPromptFormat | None = Field(default=None, deprecated="use tool_config instead")
     tool_config: ToolConfig | None = Field(default=None)
@@ -232,21 +242,11 @@ class Agent(BaseModel):
     created_at: datetime
 
 
-@json_schema_type
-class ListAgentsResponse(BaseModel):
-    data: list[Agent]
-
-
-@json_schema_type
-class ListAgentSessionsResponse(BaseModel):
-    data: list[Session]
-
-
 class AgentConfigOverridablePerTurn(AgentConfigCommon):
     instructions: str | None = None
 
 
-class AgentTurnResponseEventType(Enum):
+class AgentTurnResponseEventType(StrEnum):
     step_start = "step_start"
     step_complete = "step_complete"
     step_progress = "step_progress"
@@ -258,15 +258,15 @@ class AgentTurnResponseEventType(Enum):
 
 @json_schema_type
 class AgentTurnResponseStepStartPayload(BaseModel):
-    event_type: Literal[AgentTurnResponseEventType.step_start.value] = AgentTurnResponseEventType.step_start.value
+    event_type: Literal[AgentTurnResponseEventType.step_start] = AgentTurnResponseEventType.step_start
     step_type: StepType
     step_id: str
-    metadata: dict[str, Any] | None = Field(default_factory=dict)
+    metadata: dict[str, Any] | None = Field(default_factory=lambda: {})
 
 
 @json_schema_type
 class AgentTurnResponseStepCompletePayload(BaseModel):
-    event_type: Literal[AgentTurnResponseEventType.step_complete.value] = AgentTurnResponseEventType.step_complete.value
+    event_type: Literal[AgentTurnResponseEventType.step_complete] = AgentTurnResponseEventType.step_complete
     step_type: StepType
     step_id: str
     step_details: Step
@@ -276,7 +276,7 @@ class AgentTurnResponseStepCompletePayload(BaseModel):
 class AgentTurnResponseStepProgressPayload(BaseModel):
     model_config = ConfigDict(protected_namespaces=())
 
-    event_type: Literal[AgentTurnResponseEventType.step_progress.value] = AgentTurnResponseEventType.step_progress.value
+    event_type: Literal[AgentTurnResponseEventType.step_progress] = AgentTurnResponseEventType.step_progress
     step_type: StepType
     step_id: str
 
@@ -285,21 +285,19 @@ class AgentTurnResponseStepProgressPayload(BaseModel):
 
 @json_schema_type
 class AgentTurnResponseTurnStartPayload(BaseModel):
-    event_type: Literal[AgentTurnResponseEventType.turn_start.value] = AgentTurnResponseEventType.turn_start.value
+    event_type: Literal[AgentTurnResponseEventType.turn_start] = AgentTurnResponseEventType.turn_start
     turn_id: str
 
 
 @json_schema_type
 class AgentTurnResponseTurnCompletePayload(BaseModel):
-    event_type: Literal[AgentTurnResponseEventType.turn_complete.value] = AgentTurnResponseEventType.turn_complete.value
+    event_type: Literal[AgentTurnResponseEventType.turn_complete] = AgentTurnResponseEventType.turn_complete
     turn: Turn
 
 
 @json_schema_type
 class AgentTurnResponseTurnAwaitingInputPayload(BaseModel):
-    event_type: Literal[AgentTurnResponseEventType.turn_awaiting_input.value] = (
-        AgentTurnResponseEventType.turn_awaiting_input.value
-    )
+    event_type: Literal[AgentTurnResponseEventType.turn_awaiting_input] = AgentTurnResponseEventType.turn_awaiting_input
     turn: Turn
 
 
@@ -341,7 +339,7 @@ class AgentTurnCreateRequest(AgentConfigOverridablePerTurn):
     messages: list[UserMessage | ToolResponseMessage]
 
     documents: list[Document] | None = None
-    toolgroups: list[AgentToolGroup] | None = None
+    toolgroups: list[AgentToolGroup] | None = Field(default_factory=lambda: [])
 
     stream: bool | None = False
     tool_config: ToolConfig | None = None
@@ -415,8 +413,9 @@ class Agents(Protocol):
         :param toolgroups: (Optional) List of toolgroups to create the turn with, will be used in addition to the agent's config toolgroups for the request.
         :param tool_config: (Optional) The tool configuration to create the turn with, will be used to override the agent's tool_config.
         :returns: If stream=False, returns a Turn object.
-                  If stream=True, returns an SSE event stream of AgentTurnResponseStreamChunk
+                  If stream=True, returns an SSE event stream of AgentTurnResponseStreamChunk.
         """
+        ...
 
     @webmethod(
         route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume",
@@ -510,6 +509,7 @@ class Agents(Protocol):
         :param session_id: The ID of the session to get.
         :param agent_id: The ID of the agent to get the session for.
         :param turn_ids: (Optional) List of turn IDs to filter the session by.
+        :returns: A Session.
         """
         ...
 
@@ -519,7 +519,7 @@ class Agents(Protocol):
         session_id: str,
         agent_id: str,
     ) -> None:
-        """Delete an agent session by its ID.
+        """Delete an agent session by its ID and its associated turns.
 
         :param session_id: The ID of the session to delete.
         :param agent_id: The ID of the agent to delete the session for.
@@ -531,17 +531,19 @@ class Agents(Protocol):
         self,
         agent_id: str,
     ) -> None:
-        """Delete an agent by its ID.
+        """Delete an agent by its ID and its associated sessions and turns.
 
         :param agent_id: The ID of the agent to delete.
         """
         ...
 
     @webmethod(route="/agents", method="GET")
-    async def list_agents(self) -> ListAgentsResponse:
+    async def list_agents(self, start_index: int | None = None, limit: int | None = None) -> PaginatedResponse:
         """List all agents.
 
-        :returns: A ListAgentsResponse.
+        :param start_index: The index to start the pagination from.
+        :param limit: The number of agents to return.
+        :returns: A PaginatedResponse.
         """
         ...
 
@@ -558,11 +560,15 @@ class Agents(Protocol):
     async def list_agent_sessions(
         self,
         agent_id: str,
-    ) -> ListAgentSessionsResponse:
+        start_index: int | None = None,
+        limit: int | None = None,
+    ) -> PaginatedResponse:
         """List all session(s) of a given agent.
 
         :param agent_id: The ID of the agent to list sessions for.
-        :returns: A ListAgentSessionsResponse.
+        :param start_index: The index to start the pagination from.
+        :param limit: The number of sessions to return.
+        :returns: A PaginatedResponse.
         """
         ...
 
@@ -588,7 +594,7 @@ class Agents(Protocol):
     @webmethod(route="/openai/v1/responses", method="POST")
     async def create_openai_response(
         self,
-        input: str | list[OpenAIResponseInputMessage],
+        input: str | list[OpenAIResponseInput],
         model: str,
         previous_response_id: str | None = None,
         store: bool | None = True,
@@ -601,4 +607,6 @@ class Agents(Protocol):
         :param input: Input message(s) to create the response.
         :param model: The underlying LLM used for completions.
         :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
+        :returns: An OpenAIResponseObject.
         """
+        ...
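With `list_agents` and `list_agent_sessions` now returning a `PaginatedResponse` and accepting `start_index`/`limit`, callers can page through results. A sketch under the assumption that `PaginatedResponse` exposes `data` and `has_more`, as documented for the other paginated endpoints in this change:

```python
async def list_all_agents(agents) -> list:
    """Collect every agent by walking the paginated list_agents endpoint."""
    collected: list = []
    start_index = 0
    page_size = 20
    while True:
        page = await agents.list_agents(start_index=start_index, limit=page_size)
        collected.extend(page.data)
        if not page.has_more:
            break
        start_index += page_size
    return collected
```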
diff --git a/llama_stack/apis/agents/openai_responses.py b/llama_stack/apis/agents/openai_responses.py
index 8e11b2123..dcf0c7f9c 100644
--- a/llama_stack/apis/agents/openai_responses.py
+++ b/llama_stack/apis/agents/openai_responses.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Annotated, Literal
+from typing import Annotated, Any, Literal
 
 from pydantic import BaseModel, Field
 
@@ -17,6 +17,28 @@ class OpenAIResponseError(BaseModel):
     message: str
 
 
+@json_schema_type
+class OpenAIResponseInputMessageContentText(BaseModel):
+    text: str
+    type: Literal["input_text"] = "input_text"
+
+
+@json_schema_type
+class OpenAIResponseInputMessageContentImage(BaseModel):
+    detail: Literal["low"] | Literal["high"] | Literal["auto"] = "auto"
+    type: Literal["input_image"] = "input_image"
+    # TODO: handle file_id
+    image_url: str | None = None
+
+
+# TODO: handle file content types
+OpenAIResponseInputMessageContent = Annotated[
+    OpenAIResponseInputMessageContentText | OpenAIResponseInputMessageContentImage,
+    Field(discriminator="type"),
+]
+register_schema(OpenAIResponseInputMessageContent, name="OpenAIResponseInputMessageContent")
+
+
 @json_schema_type
 class OpenAIResponseOutputMessageContentOutputText(BaseModel):
     text: str
@@ -31,13 +53,22 @@ register_schema(OpenAIResponseOutputMessageContent, name="OpenAIResponseOutputMe
 
 
 @json_schema_type
-class OpenAIResponseOutputMessage(BaseModel):
-    id: str
-    content: list[OpenAIResponseOutputMessageContent]
-    role: Literal["assistant"] = "assistant"
-    status: str
+class OpenAIResponseMessage(BaseModel):
+    """
+    Corresponds to the various Message types in the Responses API.
+    They are all under one type because the Responses API gives them all
+    the same "type" value, and there is no way to tell them apart in certain
+    scenarios.
+    """
+
+    content: str | list[OpenAIResponseInputMessageContent] | list[OpenAIResponseOutputMessageContent]
+    role: Literal["system"] | Literal["developer"] | Literal["user"] | Literal["assistant"]
     type: Literal["message"] = "message"
 
+    # The fields below are not used in all scenarios, but are required in others.
+    id: str | None = None
+    status: str | None = None
+
 
 @json_schema_type
 class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
@@ -46,8 +77,18 @@ class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
     type: Literal["web_search_call"] = "web_search_call"
 
 
+@json_schema_type
+class OpenAIResponseOutputMessageFunctionToolCall(BaseModel):
+    arguments: str
+    call_id: str
+    name: str
+    type: Literal["function_call"] = "function_call"
+    id: str
+    status: str
+
+
 OpenAIResponseOutput = Annotated[
-    OpenAIResponseOutputMessage | OpenAIResponseOutputMessageWebSearchToolCall,
+    OpenAIResponseMessage | OpenAIResponseOutputMessageWebSearchToolCall | OpenAIResponseOutputMessageFunctionToolCall,
     Field(discriminator="type"),
 ]
 register_schema(OpenAIResponseOutput, name="OpenAIResponseOutput")
@@ -90,32 +131,29 @@ register_schema(OpenAIResponseObjectStream, name="OpenAIResponseObjectStream")
 
 
 @json_schema_type
-class OpenAIResponseInputMessageContentText(BaseModel):
-    text: str
-    type: Literal["input_text"] = "input_text"
+class OpenAIResponseInputFunctionToolCallOutput(BaseModel):
+    """
+    This represents the output of a function call that gets passed back to the model.
+    """
+
+    call_id: str
+    output: str
+    type: Literal["function_call_output"] = "function_call_output"
+    id: str | None = None
+    status: str | None = None
 
 
-@json_schema_type
-class OpenAIResponseInputMessageContentImage(BaseModel):
-    detail: Literal["low"] | Literal["high"] | Literal["auto"] = "auto"
-    type: Literal["input_image"] = "input_image"
-    # TODO: handle file_id
-    image_url: str | None = None
-
-
-# TODO: handle file content types
-OpenAIResponseInputMessageContent = Annotated[
-    OpenAIResponseInputMessageContentText | OpenAIResponseInputMessageContentImage,
-    Field(discriminator="type"),
+OpenAIResponseInput = Annotated[
+    # Responses API allows output messages to be passed in as input
+    OpenAIResponseOutputMessageWebSearchToolCall
+    | OpenAIResponseOutputMessageFunctionToolCall
+    | OpenAIResponseInputFunctionToolCallOutput
+    |
+    # Fallback to the generic message type as a last resort
+    OpenAIResponseMessage,
+    Field(union_mode="left_to_right"),
 ]
-register_schema(OpenAIResponseInputMessageContent, name="OpenAIResponseInputMessageContent")
-
-
-@json_schema_type
-class OpenAIResponseInputMessage(BaseModel):
-    content: str | list[OpenAIResponseInputMessageContent]
-    role: Literal["system"] | Literal["developer"] | Literal["user"] | Literal["assistant"]
-    type: Literal["message"] | None = "message"
+register_schema(OpenAIResponseInput, name="OpenAIResponseInput")
 
 
 @json_schema_type
@@ -126,8 +164,35 @@ class OpenAIResponseInputToolWebSearch(BaseModel):
     # TODO: add user_location
 
 
+@json_schema_type
+class OpenAIResponseInputToolFunction(BaseModel):
+    type: Literal["function"] = "function"
+    name: str
+    description: str | None = None
+    parameters: dict[str, Any] | None
+    strict: bool | None = None
+
+
+class FileSearchRankingOptions(BaseModel):
+    ranker: str | None = None
+    score_threshold: float | None = Field(default=0.0, ge=0.0, le=1.0)
+
+
+@json_schema_type
+class OpenAIResponseInputToolFileSearch(BaseModel):
+    type: Literal["file_search"] = "file_search"
+    vector_store_id: list[str]
+    ranking_options: FileSearchRankingOptions | None = None
+    # TODO: add filters
+
+
 OpenAIResponseInputTool = Annotated[
-    OpenAIResponseInputToolWebSearch,
+    OpenAIResponseInputToolWebSearch | OpenAIResponseInputToolFileSearch | OpenAIResponseInputToolFunction,
     Field(discriminator="type"),
 ]
 register_schema(OpenAIResponseInputTool, name="OpenAIResponseInputTool")
+
+
+class OpenAIResponseInputItemList(BaseModel):
+    data: list[OpenAIResponseInput]
+    object: Literal["list"] = "list"
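`OpenAIResponseInput` now accepts prior output items (web-search calls, function calls, and function-call outputs) alongside plain messages, so a function-call round trip can be replayed as input. A hedged sketch building such a list with the types added here; the IDs and values are illustrative:

```python
from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseInputFunctionToolCallOutput,
    OpenAIResponseMessage,
    OpenAIResponseOutputMessageFunctionToolCall,
)

# A user message, the function call the model previously emitted, and the
# tool's output being fed back in -- all valid OpenAIResponseInput items.
input_items = [
    OpenAIResponseMessage(role="user", content="What is the weather in Paris?"),
    OpenAIResponseOutputMessageFunctionToolCall(
        id="fc_123",
        call_id="call_123",
        name="get_weather",
        arguments='{"city": "Paris"}',
        status="completed",
    ),
    OpenAIResponseInputFunctionToolCallOutput(
        call_id="call_123",
        output='{"temperature_c": 18, "conditions": "cloudy"}',
    ),
]
```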
diff --git a/llama_stack/apis/batch_inference/batch_inference.py b/llama_stack/apis/batch_inference/batch_inference.py
index 79bc73e4c..b2aa637e2 100644
--- a/llama_stack/apis/batch_inference/batch_inference.py
+++ b/llama_stack/apis/batch_inference/batch_inference.py
@@ -38,7 +38,17 @@ class BatchInference(Protocol):
         sampling_params: SamplingParams | None = None,
         response_format: ResponseFormat | None = None,
         logprobs: LogProbConfig | None = None,
-    ) -> Job: ...
+    ) -> Job:
+        """Generate completions for a batch of content.
+
+        :param model: The model to use for the completion.
+        :param content_batch: The content to complete.
+        :param sampling_params: The sampling parameters to use for the completion.
+        :param response_format: The response format to use for the completion.
+        :param logprobs: The logprobs to use for the completion.
+        :returns: A job for the completion.
+        """
+        ...
 
     @webmethod(route="/batch-inference/chat-completion", method="POST")
     async def chat_completion(
@@ -52,4 +62,17 @@ class BatchInference(Protocol):
         tool_prompt_format: ToolPromptFormat | None = None,
         response_format: ResponseFormat | None = None,
         logprobs: LogProbConfig | None = None,
-    ) -> Job: ...
+    ) -> Job:
+        """Generate chat completions for a batch of messages.
+
+        :param model: The model to use for the chat completion.
+        :param messages_batch: The messages to complete.
+        :param sampling_params: The sampling parameters to use for the completion.
+        :param tools: The tools to use for the chat completion.
+        :param tool_choice: The tool choice to use for the chat completion.
+        :param tool_prompt_format: The tool prompt format to use for the chat completion.
+        :param response_format: The response format to use for the chat completion.
+        :param logprobs: The logprobs to use for the chat completion.
+        :returns: A job for the chat completion.
+        """
+        ...
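A hedged usage sketch for the batch chat-completion method documented above; the model name and messages are illustrative, and only the documented parameters are used:

```python
from llama_stack.apis.inference import UserMessage


async def run_batch(batch_inference):
    """Submit a small batch of chat completions and return the resulting job."""
    return await batch_inference.chat_completion(
        model="meta-llama/Llama-3.2-3B-Instruct",
        messages_batch=[
            [UserMessage(content="Say hello.")],
            [UserMessage(content="Name three primary colors.")],
        ],
    )
```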
diff --git a/llama_stack/apis/benchmarks/benchmarks.py b/llama_stack/apis/benchmarks/benchmarks.py
index 1bba42d20..d80c767f8 100644
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@@ -22,14 +22,14 @@ class CommonBenchmarkFields(BaseModel):
 
 @json_schema_type
 class Benchmark(CommonBenchmarkFields, Resource):
-    type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value
+    type: Literal[ResourceType.benchmark] = ResourceType.benchmark
 
     @property
     def benchmark_id(self) -> str:
         return self.identifier
 
     @property
-    def provider_benchmark_id(self) -> str:
+    def provider_benchmark_id(self) -> str | None:
         return self.provider_resource_id
 
 
@@ -46,13 +46,24 @@ class ListBenchmarksResponse(BaseModel):
 @runtime_checkable
 class Benchmarks(Protocol):
     @webmethod(route="/eval/benchmarks", method="GET")
-    async def list_benchmarks(self) -> ListBenchmarksResponse: ...
+    async def list_benchmarks(self) -> ListBenchmarksResponse:
+        """List all benchmarks.
+
+        :returns: A ListBenchmarksResponse.
+        """
+        ...
 
     @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET")
     async def get_benchmark(
         self,
         benchmark_id: str,
-    ) -> Benchmark: ...
+    ) -> Benchmark:
+        """Get a benchmark by its ID.
+
+        :param benchmark_id: The ID of the benchmark to get.
+        :returns: A Benchmark.
+        """
+        ...
 
     @webmethod(route="/eval/benchmarks", method="POST")
     async def register_benchmark(
@@ -63,4 +74,14 @@ class Benchmarks(Protocol):
         provider_benchmark_id: str | None = None,
         provider_id: str | None = None,
         metadata: dict[str, Any] | None = None,
-    ) -> None: ...
+    ) -> None:
+        """Register a benchmark.
+
+        :param benchmark_id: The ID of the benchmark to register.
+        :param dataset_id: The ID of the dataset to use for the benchmark.
+        :param scoring_functions: The scoring functions to use for the benchmark.
+        :param provider_benchmark_id: The ID of the provider benchmark to use for the benchmark.
+        :param provider_id: The ID of the provider to use for the benchmark.
+        :param metadata: The metadata to use for the benchmark.
+        """
+        ...
diff --git a/llama_stack/apis/common/content_types.py b/llama_stack/apis/common/content_types.py
index b9ef033dd..8bcb781f7 100644
--- a/llama_stack/apis/common/content_types.py
+++ b/llama_stack/apis/common/content_types.py
@@ -28,7 +28,7 @@ class _URLOrData(BaseModel):
 
     url: URL | None = None
     # data is a base64 encoded string, hint with contentEncoding=base64
-    data: str | None = Field(contentEncoding="base64", default=None)
+    data: str | None = Field(default=None, json_schema_extra={"contentEncoding": "base64"})
 
     @model_validator(mode="before")
     @classmethod
diff --git a/llama_stack/apis/datasetio/datasetio.py b/llama_stack/apis/datasetio/datasetio.py
index 6d160a043..1183983cc 100644
--- a/llama_stack/apis/datasetio/datasetio.py
+++ b/llama_stack/apis/datasetio/datasetio.py
@@ -34,14 +34,21 @@ class DatasetIO(Protocol):
         - limit: Number of items to return. If None or -1, returns all items.
 
         The response includes:
-        - data: List of items for the current page
-        - has_more: Whether there are more items available after this set
+        - data: List of items for the current page.
+        - has_more: Whether there are more items available after this set.
 
         :param dataset_id: The ID of the dataset to get the rows from.
         :param start_index: Index into dataset for the first row to get. Get all rows if None.
         :param limit: The number of rows to get.
+        :returns: A PaginatedResponse.
         """
         ...
 
     @webmethod(route="/datasetio/append-rows/{dataset_id:path}", method="POST")
-    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None: ...
+    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
+        """Append rows to a dataset.
+
+        :param dataset_id: The ID of the dataset to append the rows to.
+        :param rows: The rows to append to the dataset.
+        """
+        ...
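
Illustrative sketch of the paging contract described above: the caller advances start_index by the number of rows received until has_more is false. fetch_rows is a hypothetical stand-in for the DatasetIO call; only the data/has_more fields come from the docstring.

from typing import Any, Awaitable, Callable


async def collect_all_rows(
    fetch_rows: Callable[..., Awaitable[Any]],  # hypothetical stand-in for the DatasetIO method
    dataset_id: str,
    page_size: int = 100,
) -> list[dict[str, Any]]:
    rows: list[dict[str, Any]] = []
    start_index = 0
    while True:
        page = await fetch_rows(dataset_id=dataset_id, start_index=start_index, limit=page_size)
        rows.extend(page.data)             # data: items for the current page
        if not page.has_more:              # has_more: more items available after this set
            return rows
        start_index += len(page.data)      # assumption: advance by the number of rows received
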
diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py
index 796217557..e3de3d5cb 100644
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@@ -106,14 +106,14 @@ class CommonDatasetFields(BaseModel):
 
 @json_schema_type
 class Dataset(CommonDatasetFields, Resource):
-    type: Literal[ResourceType.dataset.value] = ResourceType.dataset.value
+    type: Literal[ResourceType.dataset] = ResourceType.dataset
 
     @property
     def dataset_id(self) -> str:
         return self.identifier
 
     @property
-    def provider_dataset_id(self) -> str:
+    def provider_dataset_id(self) -> str | None:
         return self.provider_resource_id
 
 
@@ -137,7 +137,8 @@ class Datasets(Protocol):
         """
         Register a new dataset.
 
-        :param purpose: The purpose of the dataset. One of
+        :param purpose: The purpose of the dataset.
+        One of:
             - "post-training/messages": The dataset contains a messages column with list of messages for post-training.
                 {
                     "messages": [
@@ -188,8 +189,9 @@ class Datasets(Protocol):
                ]
            }
         :param metadata: The metadata for the dataset.
-           - E.g. {"description": "My dataset"}
+           - E.g. {"description": "My dataset"}.
         :param dataset_id: The ID of the dataset. If not provided, an ID will be generated.
+        :returns: A Dataset.
         """
         ...
 
@@ -197,13 +199,29 @@ class Datasets(Protocol):
     async def get_dataset(
         self,
         dataset_id: str,
-    ) -> Dataset: ...
+    ) -> Dataset:
+        """Get a dataset by its ID.
+
+        :param dataset_id: The ID of the dataset to get.
+        :returns: A Dataset.
+        """
+        ...
 
     @webmethod(route="/datasets", method="GET")
-    async def list_datasets(self) -> ListDatasetsResponse: ...
+    async def list_datasets(self) -> ListDatasetsResponse:
+        """List all datasets.
+
+        :returns: A ListDatasetsResponse.
+        """
+        ...
 
     @webmethod(route="/datasets/{dataset_id:path}", method="DELETE")
     async def unregister_dataset(
         self,
         dataset_id: str,
-    ) -> None: ...
+    ) -> None:
+        """Unregister a dataset by its ID.
+
+        :param dataset_id: The ID of the dataset to unregister.
+        """
+        ...
diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py
index 23ca89a94..83a0a8e56 100644
--- a/llama_stack/apis/eval/eval.py
+++ b/llama_stack/apis/eval/eval.py
@@ -93,8 +93,9 @@ class Eval(Protocol):
 
         :param benchmark_id: The ID of the benchmark to run the evaluation on.
         :param benchmark_config: The configuration for the benchmark.
-        :return: The job that was created to run the evaluation.
+        :returns: The job that was created to run the evaluation.
         """
+        ...
 
     @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
     async def evaluate_rows(
@@ -110,8 +111,9 @@ class Eval(Protocol):
         :param input_rows: The rows to evaluate.
         :param scoring_functions: The scoring functions to use for the evaluation.
         :param benchmark_config: The configuration for the benchmark.
-        :return: EvaluateResponse object containing generations and scores
+        :returns: EvaluateResponse object containing generations and scores.
         """
+        ...
 
     @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
     async def job_status(self, benchmark_id: str, job_id: str) -> Job:
@@ -119,7 +121,7 @@ class Eval(Protocol):
 
         :param benchmark_id: The ID of the benchmark to run the evaluation on.
         :param job_id: The ID of the job to get the status of.
-        :return: The status of the evaluationjob.
+        :returns: The status of the evaluation job.
         """
         ...
 
@@ -138,5 +140,6 @@ class Eval(Protocol):
 
         :param benchmark_id: The ID of the benchmark to run the evaluation on.
         :param job_id: The ID of the job to get the result of.
-        :return: The result of the job.
+        :returns: The result of the job.
         """
+        ...
diff --git a/llama_stack/apis/files/files.py b/llama_stack/apis/files/files.py
index 4a9b49978..1d762a68a 100644
--- a/llama_stack/apis/files/files.py
+++ b/llama_stack/apis/files/files.py
@@ -91,10 +91,11 @@ class Files(Protocol):
         """
         Create a new upload session for a file identified by a bucket and key.
 
-        :param bucket: Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)
-        :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
-        :param mime_type: MIME type of the file
-        :param size: File size in bytes
+        :param bucket: Bucket under which the file is stored (valid chars: a-zA-Z0-9_-).
+        :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.).
+        :param mime_type: MIME type of the file.
+        :param size: File size in bytes.
+        :returns: A FileUploadResponse.
         """
         ...
 
@@ -107,7 +108,8 @@ class Files(Protocol):
         Upload file content to an existing upload session.
         On the server, request body will have the raw bytes that are uploaded.
 
-        :param upload_id: ID of the upload session
+        :param upload_id: ID of the upload session.
+        :returns: A FileResponse or None if the upload is not complete.
         """
         ...
 
@@ -117,9 +119,10 @@ class Files(Protocol):
         upload_id: str,
     ) -> FileUploadResponse:
         """
-        Returns information about an existsing upload session
+        Returns information about an existing upload session.
 
-        :param upload_id: ID of the upload session
+        :param upload_id: ID of the upload session.
+        :returns: A FileUploadResponse.
         """
         ...
 
@@ -130,6 +133,9 @@ class Files(Protocol):
     ) -> ListBucketResponse:
         """
         List all buckets.
+
+        :param bucket: Bucket name (valid chars: a-zA-Z0-9_-).
+        :returns: A ListBucketResponse.
         """
         ...
 
@@ -141,7 +147,8 @@ class Files(Protocol):
         """
         List all files in a bucket.
 
-        :param bucket: Bucket name (valid chars: a-zA-Z0-9_-)
+        :param bucket: Bucket name (valid chars: a-zA-Z0-9_-).
+        :returns: A ListFileResponse.
         """
         ...
 
@@ -154,8 +161,9 @@ class Files(Protocol):
         """
         Get a file info identified by a bucket and key.
 
-        :param bucket: Bucket name (valid chars: a-zA-Z0-9_-)
-        :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
+        :param bucket: Bucket name (valid chars: a-zA-Z0-9_-).
+        :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.).
+        :returns: A FileResponse.
         """
         ...
 
@@ -168,7 +176,7 @@ class Files(Protocol):
         """
         Delete a file identified by a bucket and key.
 
-        :param bucket: Bucket name (valid chars: a-zA-Z0-9_-)
-        :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
+        :param bucket: Bucket name (valid chars: a-zA-Z0-9_-).
+        :param key: Key under which the file is stored (valid chars: a-zA-Z0-9_-/.).
         """
         ...
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index dbcd1d019..7f8b20952 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import sys
 from collections.abc import AsyncIterator
 from enum import Enum
 from typing import (
@@ -35,6 +36,16 @@ register_schema(ToolCall)
 register_schema(ToolParamDefinition)
 register_schema(ToolDefinition)
 
+# TODO: use enum.StrEnum when we drop support for python 3.10
+if sys.version_info >= (3, 11):
+    from enum import StrEnum
+else:
+
+    class StrEnum(str, Enum):
+        """Backport of StrEnum for Python 3.10 and below."""
+
+        pass
+
 
 @json_schema_type
 class GreedySamplingStrategy(BaseModel):
@@ -187,7 +198,7 @@ class CompletionMessage(BaseModel):
     role: Literal["assistant"] = "assistant"
     content: InterleavedContent
     stop_reason: StopReason
-    tool_calls: list[ToolCall] | None = Field(default_factory=list)
+    tool_calls: list[ToolCall] | None = Field(default_factory=lambda: [])
 
 
 Message = Annotated[
@@ -267,7 +278,7 @@ class ChatCompletionResponseEvent(BaseModel):
     stop_reason: StopReason | None = None
 
 
-class ResponseFormatType(Enum):
+class ResponseFormatType(StrEnum):
     """Types of formats for structured (guided) decoding.
 
     :cvar json_schema: Response should conform to a JSON schema. In a Python SDK, this is often a `pydantic` model.
@@ -286,7 +297,7 @@ class JsonSchemaResponseFormat(BaseModel):
     :param json_schema: The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model.
     """
 
-    type: Literal[ResponseFormatType.json_schema.value] = ResponseFormatType.json_schema.value
+    type: Literal[ResponseFormatType.json_schema] = ResponseFormatType.json_schema
     json_schema: dict[str, Any]
 
 
@@ -298,7 +309,7 @@ class GrammarResponseFormat(BaseModel):
     :param bnf: The BNF grammar specification the response should conform to
     """
 
-    type: Literal[ResponseFormatType.grammar.value] = ResponseFormatType.grammar.value
+    type: Literal[ResponseFormatType.grammar] = ResponseFormatType.grammar
     bnf: dict[str, Any]
 
 
@@ -394,7 +405,7 @@ class ChatCompletionRequest(BaseModel):
     messages: list[Message]
     sampling_params: SamplingParams | None = Field(default_factory=SamplingParams)
 
-    tools: list[ToolDefinition] | None = Field(default_factory=list)
+    tools: list[ToolDefinition] | None = Field(default_factory=lambda: [])
     tool_config: ToolConfig | None = Field(default_factory=ToolConfig)
 
     response_format: ResponseFormat | None = None
@@ -567,14 +578,14 @@ class OpenAIResponseFormatText(BaseModel):
 @json_schema_type
 class OpenAIJSONSchema(TypedDict, total=False):
     name: str
-    description: str | None = None
-    strict: bool | None = None
+    description: str | None
+    strict: bool | None
 
     # Pydantic BaseModel cannot be used with a schema param, since it already
     # has one. And, we don't want to alias here because then have to handle
     # that alias when converting to OpenAI params. So, to support schema,
     # we use a TypedDict.
-    schema: dict[str, Any] | None = None
+    schema: dict[str, Any] | None
 
 
 @json_schema_type
@@ -809,15 +820,32 @@ class BatchChatCompletionResponse(BaseModel):
     batch: list[ChatCompletionResponse]
 
 
+class OpenAICompletionWithInputMessages(OpenAIChatCompletion):
+    input_messages: list[OpenAIMessageParam]
+
+
+@json_schema_type
+class ListOpenAIChatCompletionResponse(BaseModel):
+    data: list[OpenAICompletionWithInputMessages]
+    has_more: bool
+    first_id: str
+    last_id: str
+    object: Literal["list"] = "list"
+
+
+class Order(Enum):
+    asc = "asc"
+    desc = "desc"
+
+
 @runtime_checkable
 @trace_protocol
-class Inference(Protocol):
-    """Llama Stack Inference API for generating completions, chat completions, and embeddings.
-
-    This API provides the raw interface to the underlying models. Two kinds of models are supported:
-    - LLM models: these models generate "raw" and "chat" (conversational) completions.
-    - Embedding models: these models generate embeddings to be used for semantic search.
+class InferenceProvider(Protocol):
     """
+    This protocol defines the interface that should be implemented by all inference providers.
+    """
+
+    API_NAMESPACE: str = "Inference"
 
     model_store: ModelStore | None = None
 
@@ -834,13 +862,13 @@ class Inference(Protocol):
         """Generate a completion for the given content using the specified model.
 
         :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
-        :param content: The content to generate a completion for
-        :param sampling_params: (Optional) Parameters to control the sampling strategy
-        :param response_format: (Optional) Grammar specification for guided (structured) decoding
+        :param content: The content to generate a completion for.
+        :param sampling_params: (Optional) Parameters to control the sampling strategy.
+        :param response_format: (Optional) Grammar specification for guided (structured) decoding.
         :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
         :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
         :returns: If stream=False, returns a CompletionResponse with the full completion.
-                 If stream=True, returns an SSE event stream of CompletionResponseStreamChunk
+                 If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.
         """
         ...
 
@@ -853,6 +881,15 @@ class Inference(Protocol):
         response_format: ResponseFormat | None = None,
         logprobs: LogProbConfig | None = None,
     ) -> BatchCompletionResponse:
+        """Generate completions for a batch of content using the specified model.
+
+        :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
+        :param content_batch: The content to generate completions for.
+        :param sampling_params: (Optional) Parameters to control the sampling strategy.
+        :param response_format: (Optional) Grammar specification for guided (structured) decoding.
+        :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
+        :returns: A BatchCompletionResponse with the full completions.
+        """
         raise NotImplementedError("Batch completion is not implemented")
 
     @webmethod(route="/inference/chat-completion", method="POST")
@@ -872,9 +909,9 @@ class Inference(Protocol):
         """Generate a chat completion for the given messages using the specified model.
 
         :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
-        :param messages: List of messages in the conversation
-        :param sampling_params: Parameters to control the sampling strategy
-        :param tools: (Optional) List of tool definitions available to the model
+        :param messages: List of messages in the conversation.
+        :param sampling_params: Parameters to control the sampling strategy.
+        :param tools: (Optional) List of tool definitions available to the model.
         :param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
             .. deprecated::
                Use tool_config instead.
@@ -891,7 +928,7 @@ class Inference(Protocol):
         :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
         :param tool_config: (Optional) Configuration for tool use.
         :returns: If stream=False, returns a ChatCompletionResponse with the full completion.
-                 If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk
+                 If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.
         """
         ...
 
@@ -906,6 +943,17 @@ class Inference(Protocol):
         response_format: ResponseFormat | None = None,
         logprobs: LogProbConfig | None = None,
     ) -> BatchChatCompletionResponse:
+        """Generate chat completions for a batch of messages using the specified model.
+
+        :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
+        :param messages_batch: The messages to generate completions for.
+        :param sampling_params: (Optional) Parameters to control the sampling strategy.
+        :param tools: (Optional) List of tool definitions available to the model.
+        :param tool_config: (Optional) Configuration for tool use.
+        :param response_format: (Optional) Grammar specification for guided (structured) decoding.
+        :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
+        :returns: A BatchChatCompletionResponse with the full completions.
+        """
         raise NotImplementedError("Batch chat completion is not implemented")
 
     @webmethod(route="/inference/embeddings", method="POST")
@@ -924,7 +972,7 @@ class Inference(Protocol):
         :param output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models.
         :param text_truncation: (Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length.
         :param task_type: (Optional) How is the embedding being used? This is only supported by asymmetric embedding models.
-        :returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
+        :returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.
         """
         ...
 
@@ -956,22 +1004,23 @@ class Inference(Protocol):
         """Generate an OpenAI-compatible completion for the given prompt using the specified model.
 
         :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
-        :param prompt: The prompt to generate a completion for
-        :param best_of: (Optional) The number of completions to generate
-        :param echo: (Optional) Whether to echo the prompt
-        :param frequency_penalty: (Optional) The penalty for repeated tokens
-        :param logit_bias: (Optional) The logit bias to use
-        :param logprobs: (Optional) The log probabilities to use
-        :param max_tokens: (Optional) The maximum number of tokens to generate
-        :param n: (Optional) The number of completions to generate
-        :param presence_penalty: (Optional) The penalty for repeated tokens
-        :param seed: (Optional) The seed to use
-        :param stop: (Optional) The stop tokens to use
-        :param stream: (Optional) Whether to stream the response
-        :param stream_options: (Optional) The stream options to use
-        :param temperature: (Optional) The temperature to use
-        :param top_p: (Optional) The top p to use
-        :param user: (Optional) The user to use
+        :param prompt: The prompt to generate a completion for.
+        :param best_of: (Optional) The number of completions to generate.
+        :param echo: (Optional) Whether to echo the prompt.
+        :param frequency_penalty: (Optional) The penalty for repeated tokens.
+        :param logit_bias: (Optional) The logit bias to use.
+        :param logprobs: (Optional) The log probabilities to use.
+        :param max_tokens: (Optional) The maximum number of tokens to generate.
+        :param n: (Optional) The number of completions to generate.
+        :param presence_penalty: (Optional) The penalty for repeated tokens.
+        :param seed: (Optional) The seed to use.
+        :param stop: (Optional) The stop tokens to use.
+        :param stream: (Optional) Whether to stream the response.
+        :param stream_options: (Optional) The stream options to use.
+        :param temperature: (Optional) The temperature to use.
+        :param top_p: (Optional) The top p to use.
+        :param user: (Optional) The user to use.
+        :returns: An OpenAICompletion.
         """
         ...
 
@@ -1005,27 +1054,64 @@ class Inference(Protocol):
         """Generate an OpenAI-compatible chat completion for the given messages using the specified model.
 
         :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
-        :param messages: List of messages in the conversation
-        :param frequency_penalty: (Optional) The penalty for repeated tokens
-        :param function_call: (Optional) The function call to use
-        :param functions: (Optional) List of functions to use
-        :param logit_bias: (Optional) The logit bias to use
-        :param logprobs: (Optional) The log probabilities to use
-        :param max_completion_tokens: (Optional) The maximum number of tokens to generate
-        :param max_tokens: (Optional) The maximum number of tokens to generate
-        :param n: (Optional) The number of completions to generate
-        :param parallel_tool_calls: (Optional) Whether to parallelize tool calls
-        :param presence_penalty: (Optional) The penalty for repeated tokens
-        :param response_format: (Optional) The response format to use
-        :param seed: (Optional) The seed to use
-        :param stop: (Optional) The stop tokens to use
-        :param stream: (Optional) Whether to stream the response
-        :param stream_options: (Optional) The stream options to use
-        :param temperature: (Optional) The temperature to use
-        :param tool_choice: (Optional) The tool choice to use
-        :param tools: (Optional) The tools to use
-        :param top_logprobs: (Optional) The top log probabilities to use
-        :param top_p: (Optional) The top p to use
-        :param user: (Optional) The user to use
+        :param messages: List of messages in the conversation.
+        :param frequency_penalty: (Optional) The penalty for repeated tokens.
+        :param function_call: (Optional) The function call to use.
+        :param functions: (Optional) List of functions to use.
+        :param logit_bias: (Optional) The logit bias to use.
+        :param logprobs: (Optional) The log probabilities to use.
+        :param max_completion_tokens: (Optional) The maximum number of tokens to generate.
+        :param max_tokens: (Optional) The maximum number of tokens to generate.
+        :param n: (Optional) The number of completions to generate.
+        :param parallel_tool_calls: (Optional) Whether to parallelize tool calls.
+        :param presence_penalty: (Optional) The penalty for repeated tokens.
+        :param response_format: (Optional) The response format to use.
+        :param seed: (Optional) The seed to use.
+        :param stop: (Optional) The stop tokens to use.
+        :param stream: (Optional) Whether to stream the response.
+        :param stream_options: (Optional) The stream options to use.
+        :param temperature: (Optional) The temperature to use.
+        :param tool_choice: (Optional) The tool choice to use.
+        :param tools: (Optional) The tools to use.
+        :param top_logprobs: (Optional) The top log probabilities to use.
+        :param top_p: (Optional) The top p to use.
+        :param user: (Optional) The user to use.
+        :returns: An OpenAIChatCompletion.
         """
         ...
+
+
+class Inference(InferenceProvider):
+    """Llama Stack Inference API for generating completions, chat completions, and embeddings.
+
+    This API provides the raw interface to the underlying models. Two kinds of models are supported:
+    - LLM models: these models generate "raw" and "chat" (conversational) completions.
+    - Embedding models: these models generate embeddings to be used for semantic search.
+    """
+
+    @webmethod(route="/openai/v1/chat/completions", method="GET")
+    async def list_chat_completions(
+        self,
+        after: str | None = None,
+        limit: int | None = 20,
+        model: str | None = None,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIChatCompletionResponse:
+        """List all chat completions.
+
+        :param after: The ID of the last chat completion returned; listing resumes after this ID.
+        :param limit: The maximum number of chat completions to return.
+        :param model: The model to filter by.
+        :param order: The order to sort the chat completions by: "asc" or "desc". Defaults to "desc".
+        :returns: A ListOpenAIChatCompletionResponse.
+        """
+        raise NotImplementedError("List chat completions is not implemented")
+
+    @webmethod(route="/openai/v1/chat/completions/{completion_id}", method="GET")
+    async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
+        """Describe a chat completion by its ID.
+
+        :param completion_id: ID of the chat completion.
+        :returns: An OpenAICompletionWithInputMessages.
+        """
+        raise NotImplementedError("Get chat completion is not implemented")
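
Illustrative sketch of walking the new /openai/v1/chat/completions listing: client is a hypothetical async caller exposing list_chat_completions(), and the data/has_more/last_id attributes mirror ListOpenAIChatCompletionResponse above.

async def dump_chat_completions(client, model: str | None = None) -> list:
    """Collect every stored chat completion, newest first."""
    items: list = []
    after: str | None = None
    while True:
        page = await client.list_chat_completions(after=after, limit=20, model=model, order="desc")
        items.extend(page.data)
        if not page.has_more:
            return items
        after = page.last_id  # cursor for the next page
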
diff --git a/llama_stack/apis/inspect/inspect.py b/llama_stack/apis/inspect/inspect.py
index fb3167635..44a5e95b2 100644
--- a/llama_stack/apis/inspect/inspect.py
+++ b/llama_stack/apis/inspect/inspect.py
@@ -36,10 +36,25 @@ class ListRoutesResponse(BaseModel):
 @runtime_checkable
 class Inspect(Protocol):
     @webmethod(route="/inspect/routes", method="GET")
-    async def list_routes(self) -> ListRoutesResponse: ...
+    async def list_routes(self) -> ListRoutesResponse:
+        """List all routes.
+
+        :returns: A ListRoutesResponse.
+        """
+        ...
 
     @webmethod(route="/health", method="GET")
-    async def health(self) -> HealthInfo: ...
+    async def health(self) -> HealthInfo:
+        """Get the health of the service.
+
+        :returns: A HealthInfo.
+        """
+        ...
 
     @webmethod(route="/version", method="GET")
-    async def version(self) -> VersionInfo: ...
+    async def version(self) -> VersionInfo:
+        """Get the version of the service.
+
+        :returns: A VersionInfo.
+        """
+        ...
diff --git a/llama_stack/apis/models/models.py b/llama_stack/apis/models/models.py
index 5d7b5aac6..3d90a92a0 100644
--- a/llama_stack/apis/models/models.py
+++ b/llama_stack/apis/models/models.py
@@ -29,14 +29,14 @@ class ModelType(str, Enum):
 
 @json_schema_type
 class Model(CommonModelFields, Resource):
-    type: Literal[ResourceType.model.value] = ResourceType.model.value
+    type: Literal[ResourceType.model] = ResourceType.model
 
     @property
     def model_id(self) -> str:
         return self.identifier
 
     @property
-    def provider_model_id(self) -> str:
+    def provider_model_id(self) -> str | None:
         return self.provider_resource_id
 
     model_config = ConfigDict(protected_namespaces=())
@@ -80,16 +80,32 @@ class OpenAIListModelsResponse(BaseModel):
 @trace_protocol
 class Models(Protocol):
     @webmethod(route="/models", method="GET")
-    async def list_models(self) -> ListModelsResponse: ...
+    async def list_models(self) -> ListModelsResponse:
+        """List all models.
+
+        :returns: A ListModelsResponse.
+        """
+        ...
 
     @webmethod(route="/openai/v1/models", method="GET")
-    async def openai_list_models(self) -> OpenAIListModelsResponse: ...
+    async def openai_list_models(self) -> OpenAIListModelsResponse:
+        """List models using the OpenAI API.
+
+        :returns: An OpenAIListModelsResponse.
+        """
+        ...
 
     @webmethod(route="/models/{model_id:path}", method="GET")
     async def get_model(
         self,
         model_id: str,
-    ) -> Model: ...
+    ) -> Model:
+        """Get a model by its identifier.
+
+        :param model_id: The identifier of the model to get.
+        :returns: A Model.
+        """
+        ...
 
     @webmethod(route="/models", method="POST")
     async def register_model(
@@ -99,10 +115,25 @@ class Models(Protocol):
         provider_id: str | None = None,
         metadata: dict[str, Any] | None = None,
         model_type: ModelType | None = None,
-    ) -> Model: ...
+    ) -> Model:
+        """Register a model.
+
+        :param model_id: The identifier of the model to register.
+        :param provider_model_id: The identifier of the model in the provider.
+        :param provider_id: The identifier of the provider.
+        :param metadata: Any additional metadata for this model.
+        :param model_type: The type of model to register.
+        :returns: A Model.
+        """
+        ...
 
     @webmethod(route="/models/{model_id:path}", method="DELETE")
     async def unregister_model(
         self,
         model_id: str,
-    ) -> None: ...
+    ) -> None:
+        """Unregister a model.
+
+        :param model_id: The identifier of the model to unregister.
+        """
+        ...
diff --git a/llama_stack/apis/post_training/post_training.py b/llama_stack/apis/post_training/post_training.py
index 016f79fce..b196c8a17 100644
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@@ -182,7 +182,19 @@ class PostTraining(Protocol):
         ),
         checkpoint_dir: str | None = None,
         algorithm_config: AlgorithmConfig | None = None,
-    ) -> PostTrainingJob: ...
+    ) -> PostTrainingJob:
+        """Run supervised fine-tuning of a model.
+
+        :param job_uuid: The UUID of the job to create.
+        :param training_config: The training configuration.
+        :param hyperparam_search_config: The hyperparam search configuration.
+        :param logger_config: The logger configuration.
+        :param model: The model to fine-tune.
+        :param checkpoint_dir: The directory to save checkpoint(s) to.
+        :param algorithm_config: The algorithm configuration.
+        :returns: A PostTrainingJob.
+        """
+        ...
 
     @webmethod(route="/post-training/preference-optimize", method="POST")
     async def preference_optimize(
@@ -193,16 +205,49 @@ class PostTraining(Protocol):
         training_config: TrainingConfig,
         hyperparam_search_config: dict[str, Any],
         logger_config: dict[str, Any],
-    ) -> PostTrainingJob: ...
+    ) -> PostTrainingJob:
+        """Run preference optimization of a model.
+
+        :param job_uuid: The UUID of the job to create.
+        :param finetuned_model: The model to fine-tune.
+        :param algorithm_config: The algorithm configuration.
+        :param training_config: The training configuration.
+        :param hyperparam_search_config: The hyperparam search configuration.
+        :param logger_config: The logger configuration.
+        :returns: A PostTrainingJob.
+        """
+        ...
 
     @webmethod(route="/post-training/jobs", method="GET")
-    async def get_training_jobs(self) -> ListPostTrainingJobsResponse: ...
+    async def get_training_jobs(self) -> ListPostTrainingJobsResponse:
+        """Get all training jobs.
+
+        :returns: A ListPostTrainingJobsResponse.
+        """
+        ...
 
     @webmethod(route="/post-training/job/status", method="GET")
-    async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse: ...
+    async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse:
+        """Get the status of a training job.
+
+        :param job_uuid: The UUID of the job to get the status of.
+        :returns: A PostTrainingJobStatusResponse.
+        """
+        ...
 
     @webmethod(route="/post-training/job/cancel", method="POST")
-    async def cancel_training_job(self, job_uuid: str) -> None: ...
+    async def cancel_training_job(self, job_uuid: str) -> None:
+        """Cancel a training job.
+
+        :param job_uuid: The UUID of the job to cancel.
+        """
+        ...
 
     @webmethod(route="/post-training/job/artifacts", method="GET")
-    async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse: ...
+    async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse:
+        """Get the artifacts of a training job.
+
+        :param job_uuid: The UUID of the job to get the artifacts of.
+        :returns: A PostTrainingJobArtifactsResponse.
+        """
+        ...
diff --git a/llama_stack/apis/providers/providers.py b/llama_stack/apis/providers/providers.py
index 751c9263b..4bc977bf1 100644
--- a/llama_stack/apis/providers/providers.py
+++ b/llama_stack/apis/providers/providers.py
@@ -32,7 +32,18 @@ class Providers(Protocol):
     """
 
     @webmethod(route="/providers", method="GET")
-    async def list_providers(self) -> ListProvidersResponse: ...
+    async def list_providers(self) -> ListProvidersResponse:
+        """List all available providers.
+
+        :returns: A ListProvidersResponse containing information about all providers.
+        """
+        ...
 
     @webmethod(route="/providers/{provider_id}", method="GET")
-    async def inspect_provider(self, provider_id: str) -> ProviderInfo: ...
+    async def inspect_provider(self, provider_id: str) -> ProviderInfo:
+        """Get detailed information about a specific provider.
+
+        :param provider_id: The ID of the provider to inspect.
+        :returns: A ProviderInfo object containing the provider's details.
+        """
+        ...
diff --git a/llama_stack/apis/resource.py b/llama_stack/apis/resource.py
index 70ec63c55..175baa7b9 100644
--- a/llama_stack/apis/resource.py
+++ b/llama_stack/apis/resource.py
@@ -4,12 +4,23 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import sys
 from enum import Enum
 
 from pydantic import BaseModel, Field
 
+# TODO: use enum.StrEnum when we drop support for python 3.10
+if sys.version_info >= (3, 11):
+    from enum import StrEnum
+else:
 
-class ResourceType(Enum):
+    class StrEnum(str, Enum):
+        """Backport of StrEnum for Python 3.10 and below."""
+
+        pass
+
+
+class ResourceType(StrEnum):
     model = "model"
     shield = "shield"
     vector_db = "vector_db"
@@ -25,9 +36,9 @@ class Resource(BaseModel):
 
     identifier: str = Field(description="Unique identifier for this resource in llama stack")
 
-    provider_resource_id: str = Field(
-        description="Unique identifier for this resource in the provider",
+    provider_resource_id: str | None = Field(
         default=None,
+        description="Unique identifier for this resource in the provider",
     )
 
     provider_id: str = Field(description="ID of the provider that owns this resource")
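
A note on the StrEnum backport introduced here (and mirrored in inference.py and scoring_functions.py): making ResourceType members str subclasses is what lets the Literal[ResourceType.model]-style defaults above behave as plain strings. A minimal sketch of the property being relied on:

import sys
from enum import Enum

if sys.version_info >= (3, 11):
    from enum import StrEnum
else:

    class StrEnum(str, Enum):
        """Backport of StrEnum for Python 3.10 and below."""


class ResourceType(StrEnum):
    model = "model"


# Members compare equal to plain strings and can be used wherever a str is
# expected, so the Literal[ResourceType.model] default serializes as "model".
assert ResourceType.model == "model"
assert isinstance(ResourceType.model, str)
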
diff --git a/llama_stack/apis/safety/safety.py b/llama_stack/apis/safety/safety.py
index e139f5ffc..3aee52b7e 100644
--- a/llama_stack/apis/safety/safety.py
+++ b/llama_stack/apis/safety/safety.py
@@ -53,5 +53,13 @@ class Safety(Protocol):
         self,
         shield_id: str,
         messages: list[Message],
-        params: dict[str, Any] = None,
-    ) -> RunShieldResponse: ...
+        params: dict[str, Any],
+    ) -> RunShieldResponse:
+        """Run a shield.
+
+        :param shield_id: The identifier of the shield to run.
+        :param messages: The messages to run the shield on.
+        :param params: The parameters of the shield.
+        :returns: A RunShieldResponse.
+        """
+        ...
diff --git a/llama_stack/apis/scoring/scoring.py b/llama_stack/apis/scoring/scoring.py
index 414f3d5e2..732e80e79 100644
--- a/llama_stack/apis/scoring/scoring.py
+++ b/llama_stack/apis/scoring/scoring.py
@@ -61,7 +61,15 @@ class Scoring(Protocol):
         dataset_id: str,
         scoring_functions: dict[str, ScoringFnParams | None],
         save_results_dataset: bool = False,
-    ) -> ScoreBatchResponse: ...
+    ) -> ScoreBatchResponse:
+        """Score a batch of rows.
+
+        :param dataset_id: The ID of the dataset to score.
+        :param scoring_functions: The scoring functions to use for the scoring.
+        :param save_results_dataset: Whether to save the results to a dataset.
+        :returns: A ScoreBatchResponse.
+        """
+        ...
 
     @webmethod(route="/scoring/score", method="POST")
     async def score(
@@ -73,6 +81,6 @@ class Scoring(Protocol):
 
         :param input_rows: The rows to score.
         :param scoring_functions: The scoring functions to use for the scoring.
-        :return: ScoreResponse object containing rows and aggregated results
+        :returns: A ScoreResponse object containing rows and aggregated results.
         """
         ...
diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py
index 6c7819965..9cd21b7d1 100644
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@@ -4,6 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+# TODO: use enum.StrEnum when we drop support for python 3.10
+import sys
 from enum import Enum
 from typing import (
     Annotated,
@@ -19,18 +21,27 @@ from llama_stack.apis.common.type_system import ParamType
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 
+if sys.version_info >= (3, 11):
+    from enum import StrEnum
+else:
+
+    class StrEnum(str, Enum):
+        """Backport of StrEnum for Python 3.10 and below."""
+
+        pass
+
 
 # Perhaps more structure can be imposed on these functions. Maybe they could be associated
 # with standard metrics so they can be rolled up?
 @json_schema_type
-class ScoringFnParamsType(Enum):
+class ScoringFnParamsType(StrEnum):
     llm_as_judge = "llm_as_judge"
     regex_parser = "regex_parser"
     basic = "basic"
 
 
 @json_schema_type
-class AggregationFunctionType(Enum):
+class AggregationFunctionType(StrEnum):
     average = "average"
     weighted_average = "weighted_average"
     median = "median"
@@ -40,36 +51,36 @@ class AggregationFunctionType(Enum):
 
 @json_schema_type
 class LLMAsJudgeScoringFnParams(BaseModel):
-    type: Literal[ScoringFnParamsType.llm_as_judge.value] = ScoringFnParamsType.llm_as_judge.value
+    type: Literal[ScoringFnParamsType.llm_as_judge] = ScoringFnParamsType.llm_as_judge
     judge_model: str
     prompt_template: str | None = None
-    judge_score_regexes: list[str] | None = Field(
+    judge_score_regexes: list[str] = Field(
         description="Regexes to extract the answer from generated response",
-        default_factory=list,
+        default_factory=lambda: [],
     )
-    aggregation_functions: list[AggregationFunctionType] | None = Field(
+    aggregation_functions: list[AggregationFunctionType] = Field(
         description="Aggregation functions to apply to the scores of each row",
-        default_factory=list,
+        default_factory=lambda: [],
     )
 
 
 @json_schema_type
 class RegexParserScoringFnParams(BaseModel):
-    type: Literal[ScoringFnParamsType.regex_parser.value] = ScoringFnParamsType.regex_parser.value
-    parsing_regexes: list[str] | None = Field(
+    type: Literal[ScoringFnParamsType.regex_parser] = ScoringFnParamsType.regex_parser
+    parsing_regexes: list[str] = Field(
         description="Regex to extract the answer from generated response",
-        default_factory=list,
+        default_factory=lambda: [],
     )
-    aggregation_functions: list[AggregationFunctionType] | None = Field(
+    aggregation_functions: list[AggregationFunctionType] = Field(
         description="Aggregation functions to apply to the scores of each row",
-        default_factory=list,
+        default_factory=lambda: [],
     )
 
 
 @json_schema_type
 class BasicScoringFnParams(BaseModel):
-    type: Literal[ScoringFnParamsType.basic.value] = ScoringFnParamsType.basic.value
-    aggregation_functions: list[AggregationFunctionType] | None = Field(
+    type: Literal[ScoringFnParamsType.basic] = ScoringFnParamsType.basic
+    aggregation_functions: list[AggregationFunctionType] = Field(
         description="Aggregation functions to apply to the scores of each row",
         default_factory=list,
     )
@@ -99,14 +110,14 @@ class CommonScoringFnFields(BaseModel):
 
 @json_schema_type
 class ScoringFn(CommonScoringFnFields, Resource):
-    type: Literal[ResourceType.scoring_function.value] = ResourceType.scoring_function.value
+    type: Literal[ResourceType.scoring_function] = ResourceType.scoring_function
 
     @property
     def scoring_fn_id(self) -> str:
         return self.identifier
 
     @property
-    def provider_scoring_fn_id(self) -> str:
+    def provider_scoring_fn_id(self) -> str | None:
         return self.provider_resource_id
 
 
@@ -123,10 +134,21 @@ class ListScoringFunctionsResponse(BaseModel):
 @runtime_checkable
 class ScoringFunctions(Protocol):
     @webmethod(route="/scoring-functions", method="GET")
-    async def list_scoring_functions(self) -> ListScoringFunctionsResponse: ...
+    async def list_scoring_functions(self) -> ListScoringFunctionsResponse:
+        """List all scoring functions.
+
+        :returns: A ListScoringFunctionsResponse.
+        """
+        ...
 
     @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET")
-    async def get_scoring_function(self, scoring_fn_id: str, /) -> ScoringFn: ...
+    async def get_scoring_function(self, scoring_fn_id: str, /) -> ScoringFn:
+        """Get a scoring function by its ID.
+
+        :param scoring_fn_id: The ID of the scoring function to get.
+        :returns: A ScoringFn.
+        """
+        ...
 
     @webmethod(route="/scoring-functions", method="POST")
     async def register_scoring_function(
@@ -137,4 +159,14 @@ class ScoringFunctions(Protocol):
         provider_scoring_fn_id: str | None = None,
         provider_id: str | None = None,
         params: ScoringFnParams | None = None,
-    ) -> None: ...
+    ) -> None:
+        """Register a scoring function.
+
+        :param scoring_fn_id: The ID of the scoring function to register.
+        :param description: The description of the scoring function.
+        :param return_type: The return type of the scoring function.
+        :param provider_scoring_fn_id: The ID of the provider scoring function to use for the scoring function.
+        :param provider_id: The ID of the provider to use for the scoring function.
+        :param params: The parameters for the scoring function for benchmark eval; these can be overridden for app eval.
+        """
+        ...
diff --git a/llama_stack/apis/shields/shields.py b/llama_stack/apis/shields/shields.py
index 4172fcbd1..ce1f73d8e 100644
--- a/llama_stack/apis/shields/shields.py
+++ b/llama_stack/apis/shields/shields.py
@@ -21,14 +21,14 @@ class CommonShieldFields(BaseModel):
 class Shield(CommonShieldFields, Resource):
     """A safety shield resource that can be used to check content"""
 
-    type: Literal[ResourceType.shield.value] = ResourceType.shield.value
+    type: Literal[ResourceType.shield] = ResourceType.shield
 
     @property
     def shield_id(self) -> str:
         return self.identifier
 
     @property
-    def provider_shield_id(self) -> str:
+    def provider_shield_id(self) -> str | None:
         return self.provider_resource_id
 
 
@@ -46,10 +46,21 @@ class ListShieldsResponse(BaseModel):
 @trace_protocol
 class Shields(Protocol):
     @webmethod(route="/shields", method="GET")
-    async def list_shields(self) -> ListShieldsResponse: ...
+    async def list_shields(self) -> ListShieldsResponse:
+        """List all shields.
+
+        :returns: A ListShieldsResponse.
+        """
+        ...
 
     @webmethod(route="/shields/{identifier:path}", method="GET")
-    async def get_shield(self, identifier: str) -> Shield: ...
+    async def get_shield(self, identifier: str) -> Shield:
+        """Get a shield by its identifier.
+
+        :param identifier: The identifier of the shield to get.
+        :returns: A Shield.
+        """
+        ...
 
     @webmethod(route="/shields", method="POST")
     async def register_shield(
@@ -58,4 +69,13 @@ class Shields(Protocol):
         provider_shield_id: str | None = None,
         provider_id: str | None = None,
         params: dict[str, Any] | None = None,
-    ) -> Shield: ...
+    ) -> Shield:
+        """Register a shield.
+
+        :param shield_id: The identifier of the shield to register.
+        :param provider_shield_id: The identifier of the shield in the provider.
+        :param provider_id: The identifier of the provider.
+        :param params: The parameters of the shield.
+        :returns: A Shield.
+        """
+        ...
diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py
index af0469d2b..0eb53f397 100644
--- a/llama_stack/apis/telemetry/telemetry.py
+++ b/llama_stack/apis/telemetry/telemetry.py
@@ -37,7 +37,7 @@ class Span(BaseModel):
     name: str
     start_time: datetime
     end_time: datetime | None = None
-    attributes: dict[str, Any] | None = Field(default_factory=dict)
+    attributes: dict[str, Any] | None = Field(default_factory=lambda: {})
 
     def set_attribute(self, key: str, value: Any):
         if self.attributes is None:
@@ -74,19 +74,19 @@ class EventCommon(BaseModel):
     trace_id: str
     span_id: str
     timestamp: datetime
-    attributes: dict[str, Primitive] | None = Field(default_factory=dict)
+    attributes: dict[str, Primitive] | None = Field(default_factory=lambda: {})
 
 
 @json_schema_type
 class UnstructuredLogEvent(EventCommon):
-    type: Literal[EventType.UNSTRUCTURED_LOG.value] = EventType.UNSTRUCTURED_LOG.value
+    type: Literal[EventType.UNSTRUCTURED_LOG] = EventType.UNSTRUCTURED_LOG
     message: str
     severity: LogSeverity
 
 
 @json_schema_type
 class MetricEvent(EventCommon):
-    type: Literal[EventType.METRIC.value] = EventType.METRIC.value
+    type: Literal[EventType.METRIC] = EventType.METRIC
     metric: str  # this would be an enum
     value: int | float
     unit: str
@@ -131,14 +131,14 @@ class StructuredLogType(Enum):
 
 @json_schema_type
 class SpanStartPayload(BaseModel):
-    type: Literal[StructuredLogType.SPAN_START.value] = StructuredLogType.SPAN_START.value
+    type: Literal[StructuredLogType.SPAN_START] = StructuredLogType.SPAN_START
     name: str
     parent_span_id: str | None = None
 
 
 @json_schema_type
 class SpanEndPayload(BaseModel):
-    type: Literal[StructuredLogType.SPAN_END.value] = StructuredLogType.SPAN_END.value
+    type: Literal[StructuredLogType.SPAN_END] = StructuredLogType.SPAN_END
     status: SpanStatus
 
 
@@ -151,7 +151,7 @@ register_schema(StructuredLogPayload, name="StructuredLogPayload")
 
 @json_schema_type
 class StructuredLogEvent(EventCommon):
-    type: Literal[EventType.STRUCTURED_LOG.value] = EventType.STRUCTURED_LOG.value
+    type: Literal[EventType.STRUCTURED_LOG] = EventType.STRUCTURED_LOG
     payload: StructuredLogPayload
 
 
@@ -203,10 +203,61 @@ class QuerySpanTreeResponse(BaseModel):
     data: dict[str, SpanWithStatus]
 
 
+class MetricQueryType(Enum):
+    RANGE = "range"
+    INSTANT = "instant"
+
+
+class MetricLabelOperator(Enum):
+    EQUALS = "="
+    NOT_EQUALS = "!="
+    REGEX_MATCH = "=~"
+    REGEX_NOT_MATCH = "!~"
+
+
+class MetricLabelMatcher(BaseModel):
+    name: str
+    value: str
+    operator: MetricLabelOperator = MetricLabelOperator.EQUALS
+
+
+@json_schema_type
+class MetricLabel(BaseModel):
+    name: str
+    value: str
+
+
+@json_schema_type
+class MetricDataPoint(BaseModel):
+    timestamp: int
+    value: float
+
+
+@json_schema_type
+class MetricSeries(BaseModel):
+    metric: str
+    labels: list[MetricLabel]
+    values: list[MetricDataPoint]
+
+
+class QueryMetricsResponse(BaseModel):
+    data: list[MetricSeries]
+
+
 @runtime_checkable
 class Telemetry(Protocol):
     @webmethod(route="/telemetry/events", method="POST")
-    async def log_event(self, event: Event, ttl_seconds: int = DEFAULT_TTL_DAYS * 86400) -> None: ...
+    async def log_event(
+        self,
+        event: Event,
+        ttl_seconds: int = DEFAULT_TTL_DAYS * 86400,
+    ) -> None:
+        """Log an event.
+
+        :param event: The event to log.
+        :param ttl_seconds: The time to live of the event.
+        """
+        ...
 
     @webmethod(route="/telemetry/traces", method="POST")
     async def query_traces(
@@ -215,13 +266,35 @@ class Telemetry(Protocol):
         limit: int | None = 100,
         offset: int | None = 0,
         order_by: list[str] | None = None,
-    ) -> QueryTracesResponse: ...
+    ) -> QueryTracesResponse:
+        """Query traces.
+
+        :param attribute_filters: The attribute filters to apply to the traces.
+        :param limit: The limit of traces to return.
+        :param offset: The offset of the traces to return.
+        :param order_by: The order by of the traces to return.
+        :returns: A QueryTracesResponse.
+        """
+        ...
 
     @webmethod(route="/telemetry/traces/{trace_id:path}", method="GET")
-    async def get_trace(self, trace_id: str) -> Trace: ...
+    async def get_trace(self, trace_id: str) -> Trace:
+        """Get a trace by its ID.
+
+        :param trace_id: The ID of the trace to get.
+        :returns: A Trace.
+        """
+        ...
 
     @webmethod(route="/telemetry/traces/{trace_id:path}/spans/{span_id:path}", method="GET")
-    async def get_span(self, trace_id: str, span_id: str) -> Span: ...
+    async def get_span(self, trace_id: str, span_id: str) -> Span:
+        """Get a span by its ID.
+
+        :param trace_id: The ID of the trace to get the span from.
+        :param span_id: The ID of the span to get.
+        :returns: A Span.
+        """
+        ...
 
     @webmethod(route="/telemetry/spans/{span_id:path}/tree", method="POST")
     async def get_span_tree(
@@ -229,7 +302,15 @@ class Telemetry(Protocol):
         span_id: str,
         attributes_to_return: list[str] | None = None,
         max_depth: int | None = None,
-    ) -> QuerySpanTreeResponse: ...
+    ) -> QuerySpanTreeResponse:
+        """Get a span tree by its ID.
+
+        :param span_id: The ID of the span to get the tree from.
+        :param attributes_to_return: The attributes to return in the tree.
+        :param max_depth: The maximum depth of the tree.
+        :returns: A QuerySpanTreeResponse.
+        """
+        ...
 
     @webmethod(route="/telemetry/spans", method="POST")
     async def query_spans(
@@ -237,7 +318,15 @@ class Telemetry(Protocol):
         attribute_filters: list[QueryCondition],
         attributes_to_return: list[str],
         max_depth: int | None = None,
-    ) -> QuerySpansResponse: ...
+    ) -> QuerySpansResponse:
+        """Query spans.
+
+        :param attribute_filters: The attribute filters to apply to the spans.
+        :param attributes_to_return: The attributes to return in the spans.
+        :param max_depth: The maximum depth of the tree.
+        :returns: A QuerySpansResponse.
+        """
+        ...
 
     @webmethod(route="/telemetry/spans/export", method="POST")
     async def save_spans_to_dataset(
@@ -246,4 +335,34 @@ class Telemetry(Protocol):
         attributes_to_save: list[str],
         dataset_id: str,
         max_depth: int | None = None,
-    ) -> None: ...
+    ) -> None:
+        """Save spans to a dataset.
+
+        :param attribute_filters: The attribute filters to apply to the spans.
+        :param attributes_to_save: The attributes to save to the dataset.
+        :param dataset_id: The ID of the dataset to save the spans to.
+        :param max_depth: The maximum depth of the tree.
+        """
+        ...
+
+    @webmethod(route="/telemetry/metrics/{metric_name}", method="POST")
+    async def query_metrics(
+        self,
+        metric_name: str,
+        start_time: int,
+        end_time: int | None = None,
+        granularity: str | None = "1d",
+        query_type: MetricQueryType = MetricQueryType.RANGE,
+        label_matchers: list[MetricLabelMatcher] | None = None,
+    ) -> QueryMetricsResponse:
+        """Query metrics.
+
+        :param metric_name: The name of the metric to query.
+        :param start_time: The start time of the metric to query.
+        :param end_time: The end time of the metric to query.
+        :param granularity: The granularity of the metric to query.
+        :param query_type: The type of query to perform.
+        :param label_matchers: The label matchers to apply to the metric.
+        :returns: A QueryMetricsResponse.
+        """
+        ...
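
Illustrative sketch of calling the new query_metrics endpoint once this patch is applied: telemetry is a hypothetical Telemetry implementation and "prompt_tokens" is a made-up metric name; the models and keyword arguments mirror the definitions added above.

import time

from llama_stack.apis.telemetry.telemetry import (
    MetricLabelMatcher,
    MetricLabelOperator,
    MetricQueryType,
)


async def tokens_for_last_day(telemetry) -> None:
    matchers = [
        MetricLabelMatcher(
            name="model_id",
            value="meta-llama/.*",
            operator=MetricLabelOperator.REGEX_MATCH,
        )
    ]
    response = await telemetry.query_metrics(
        metric_name="prompt_tokens",          # hypothetical metric name
        start_time=int(time.time()) - 86400,  # last 24 hours
        granularity="1h",
        query_type=MetricQueryType.RANGE,
        label_matchers=matchers,
    )
    for series in response.data:
        print(series.metric, [(p.timestamp, p.value) for p in series.values])
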
diff --git a/llama_stack/apis/tools/rag_tool.py b/llama_stack/apis/tools/rag_tool.py
index fdf199b1a..de3e4c62c 100644
--- a/llama_stack/apis/tools/rag_tool.py
+++ b/llama_stack/apis/tools/rag_tool.py
@@ -7,7 +7,7 @@
 from enum import Enum
 from typing import Annotated, Any, Literal
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 from typing_extensions import Protocol, runtime_checkable
 
 from llama_stack.apis.common.content_types import URL, InterleavedContent
@@ -67,11 +67,33 @@ register_schema(RAGQueryGeneratorConfig, name="RAGQueryGeneratorConfig")
 
 @json_schema_type
 class RAGQueryConfig(BaseModel):
+    """
+    Configuration for the RAG query generation.
+
+    :param query_generator_config: Configuration for the query generator.
+    :param max_tokens_in_context: Maximum number of tokens in the context.
+    :param max_chunks: Maximum number of chunks to retrieve.
+    :param chunk_template: Template for formatting each retrieved chunk in the context.
+        Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict).
+        Default: "Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n"
+    """
+
     # This config defines how a query is generated using the messages
     # for memory bank retrieval.
     query_generator_config: RAGQueryGeneratorConfig = Field(default=DefaultRAGQueryGeneratorConfig())
     max_tokens_in_context: int = 4096
     max_chunks: int = 5
+    chunk_template: str = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n"
+
+    @field_validator("chunk_template")
+    def validate_chunk_template(cls, v: str) -> str:
+        if "{chunk.content}" not in v:
+            raise ValueError("chunk_template must contain {chunk.content}")
+        if "{index}" not in v:
+            raise ValueError("chunk_template must contain {index}")
+        if len(v) == 0:
+            raise ValueError("chunk_template must not be empty")
+        return v
 
 
 @runtime_checkable
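
A short sketch of the new `chunk_template` knob, assuming `RAGQueryConfig` is importable from the module patched above; the template string itself is only an example.

```python
from llama_stack.apis.tools.rag_tool import RAGQueryConfig  # path per the hunk above

# A custom template must keep the {index} and {chunk.content} placeholders;
# the field validator above rejects templates missing either one.
config = RAGQueryConfig(
    max_chunks=3,
    chunk_template="[{index}] {chunk.content}\n(metadata: {metadata})\n",
)

# This would fail validation because {index} is missing:
# RAGQueryConfig(chunk_template="Content: {chunk.content}\n")
```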
diff --git a/llama_stack/apis/tools/tools.py b/llama_stack/apis/tools/tools.py
index eda627932..2f62b0ba1 100644
--- a/llama_stack/apis/tools/tools.py
+++ b/llama_stack/apis/tools/tools.py
@@ -36,7 +36,7 @@ class ToolHost(Enum):
 
 @json_schema_type
 class Tool(Resource):
-    type: Literal[ResourceType.tool.value] = ResourceType.tool.value
+    type: Literal[ResourceType.tool] = ResourceType.tool
     toolgroup_id: str
     tool_host: ToolHost
     description: str
@@ -62,7 +62,7 @@ class ToolGroupInput(BaseModel):
 
 @json_schema_type
 class ToolGroup(Resource):
-    type: Literal[ResourceType.tool_group.value] = ResourceType.tool_group.value
+    type: Literal[ResourceType.tool_group] = ResourceType.tool_group
     mcp_endpoint: URL | None = None
     args: dict[str, Any] | None = None
 
@@ -103,37 +103,65 @@ class ToolGroups(Protocol):
         mcp_endpoint: URL | None = None,
         args: dict[str, Any] | None = None,
     ) -> None:
-        """Register a tool group"""
+        """Register a tool group.
+
+        :param toolgroup_id: The ID of the tool group to register.
+        :param provider_id: The ID of the provider to use for the tool group.
+        :param mcp_endpoint: The MCP endpoint to use for the tool group.
+        :param args: A dictionary of arguments to pass to the tool group.
+        """
         ...
 
     @webmethod(route="/toolgroups/{toolgroup_id:path}", method="GET")
     async def get_tool_group(
         self,
         toolgroup_id: str,
-    ) -> ToolGroup: ...
+    ) -> ToolGroup:
+        """Get a tool group by its ID.
+
+        :param toolgroup_id: The ID of the tool group to get.
+        :returns: A ToolGroup.
+        """
+        ...
 
     @webmethod(route="/toolgroups", method="GET")
     async def list_tool_groups(self) -> ListToolGroupsResponse:
-        """List tool groups with optional provider"""
+        """List tool groups with optional provider.
+
+        :returns: A ListToolGroupsResponse.
+        """
         ...
 
     @webmethod(route="/tools", method="GET")
     async def list_tools(self, toolgroup_id: str | None = None) -> ListToolsResponse:
-        """List tools with optional tool group"""
+        """List tools with optional tool group.
+
+        :param toolgroup_id: The ID of the tool group to list tools for.
+        :returns: A ListToolsResponse.
+        """
         ...
 
     @webmethod(route="/tools/{tool_name:path}", method="GET")
     async def get_tool(
         self,
         tool_name: str,
-    ) -> Tool: ...
+    ) -> Tool:
+        """Get a tool by its name.
+
+        :param tool_name: The name of the tool to get.
+        :returns: A Tool.
+        """
+        ...
 
     @webmethod(route="/toolgroups/{toolgroup_id:path}", method="DELETE")
     async def unregister_toolgroup(
         self,
         toolgroup_id: str,
     ) -> None:
-        """Unregister a tool group"""
+        """Unregister a tool group.
+
+        :param toolgroup_id: The ID of the tool group to unregister.
+        """
         ...
 
 
@@ -152,9 +180,21 @@ class ToolRuntime(Protocol):
     @webmethod(route="/tool-runtime/list-tools", method="GET")
     async def list_runtime_tools(
         self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
-    ) -> ListToolDefsResponse: ...
+    ) -> ListToolDefsResponse:
+        """List all tools in the runtime.
+
+        :param tool_group_id: The ID of the tool group to list tools for.
+        :param mcp_endpoint: The MCP endpoint to use for the tool group.
+        :returns: A ListToolDefsResponse.
+        """
+        ...
 
     @webmethod(route="/tool-runtime/invoke", method="POST")
     async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
-        """Run a tool with the given arguments"""
+        """Run a tool with the given arguments.
+
+        :param tool_name: The name of the tool to invoke.
+        :param kwargs: A dictionary of arguments to pass to the tool.
+        :returns: A ToolInvocationResult.
+        """
         ...
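
As a usage sketch for the newly documented `ToolRuntime` methods: `tool_runtime` is any implementation of the protocol, and the tool-group ID, tool name, and arguments below are placeholders.

```python
async def run_web_search(tool_runtime):
    # List tools in a (placeholder) tool group, then invoke one by name.
    await tool_runtime.list_runtime_tools(tool_group_id="builtin::websearch")
    return await tool_runtime.invoke_tool(
        tool_name="web_search",
        kwargs={"query": "llama stack telemetry"},
    )
```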
diff --git a/llama_stack/apis/vector_dbs/vector_dbs.py b/llama_stack/apis/vector_dbs/vector_dbs.py
index 6224566cd..405852476 100644
--- a/llama_stack/apis/vector_dbs/vector_dbs.py
+++ b/llama_stack/apis/vector_dbs/vector_dbs.py
@@ -15,7 +15,7 @@ from llama_stack.schema_utils import json_schema_type, webmethod
 
 @json_schema_type
 class VectorDB(Resource):
-    type: Literal[ResourceType.vector_db.value] = ResourceType.vector_db.value
+    type: Literal[ResourceType.vector_db] = ResourceType.vector_db
 
     embedding_model: str
     embedding_dimension: int
@@ -25,7 +25,7 @@ class VectorDB(Resource):
         return self.identifier
 
     @property
-    def provider_vector_db_id(self) -> str:
+    def provider_vector_db_id(self) -> str | None:
         return self.provider_resource_id
 
 
@@ -44,13 +44,24 @@ class ListVectorDBsResponse(BaseModel):
 @trace_protocol
 class VectorDBs(Protocol):
     @webmethod(route="/vector-dbs", method="GET")
-    async def list_vector_dbs(self) -> ListVectorDBsResponse: ...
+    async def list_vector_dbs(self) -> ListVectorDBsResponse:
+        """List all vector databases.
+
+        :returns: A ListVectorDBsResponse.
+        """
+        ...
 
     @webmethod(route="/vector-dbs/{vector_db_id:path}", method="GET")
     async def get_vector_db(
         self,
         vector_db_id: str,
-    ) -> VectorDB: ...
+    ) -> VectorDB:
+        """Get a vector database by its identifier.
+
+        :param vector_db_id: The identifier of the vector database to get.
+        :returns: A VectorDB.
+        """
+        ...
 
     @webmethod(route="/vector-dbs", method="POST")
     async def register_vector_db(
@@ -60,7 +71,22 @@ class VectorDBs(Protocol):
         embedding_dimension: int | None = 384,
         provider_id: str | None = None,
         provider_vector_db_id: str | None = None,
-    ) -> VectorDB: ...
+    ) -> VectorDB:
+        """Register a vector database.
+
+        :param vector_db_id: The identifier of the vector database to register.
+        :param embedding_model: The embedding model to use.
+        :param embedding_dimension: The dimension of the embedding model.
+        :param provider_id: The identifier of the provider.
+        :param provider_vector_db_id: The identifier of the vector database in the provider.
+        :returns: A VectorDB.
+        """
+        ...
 
     @webmethod(route="/vector-dbs/{vector_db_id:path}", method="DELETE")
-    async def unregister_vector_db(self, vector_db_id: str) -> None: ...
+    async def unregister_vector_db(self, vector_db_id: str) -> None:
+        """Unregister a vector database.
+
+        :param vector_db_id: The identifier of the vector database to unregister.
+        """
+        ...
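
A minimal sketch of the register/unregister flow documented above; `vector_dbs` is any `VectorDBs` implementation, and the identifier and embedding model are placeholders.

```python
async def setup_and_teardown(vector_dbs):
    db = await vector_dbs.register_vector_db(
        vector_db_id="my-docs",
        embedding_model="all-MiniLM-L6-v2",  # placeholder model name
        embedding_dimension=384,
    )
    # ... use the database, then clean up ...
    await vector_dbs.unregister_vector_db(vector_db_id=db.identifier)
```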
diff --git a/llama_stack/apis/vector_io/vector_io.py b/llama_stack/apis/vector_io/vector_io.py
index bfae0f802..3ac62d42c 100644
--- a/llama_stack/apis/vector_io/vector_io.py
+++ b/llama_stack/apis/vector_io/vector_io.py
@@ -46,7 +46,14 @@ class VectorIO(Protocol):
         vector_db_id: str,
         chunks: list[Chunk],
         ttl_seconds: int | None = None,
-    ) -> None: ...
+    ) -> None:
+        """Insert chunks into a vector database.
+
+        :param vector_db_id: The identifier of the vector database to insert the chunks into.
+        :param chunks: The chunks to insert.
+        :param ttl_seconds: The time to live of the chunks.
+        """
+        ...
 
     @webmethod(route="/vector-io/query", method="POST")
     async def query_chunks(
@@ -54,4 +61,12 @@ class VectorIO(Protocol):
         vector_db_id: str,
         query: InterleavedContent,
         params: dict[str, Any] | None = None,
-    ) -> QueryChunksResponse: ...
+    ) -> QueryChunksResponse:
+        """Query chunks from a vector database.
+
+        :param vector_db_id: The identifier of the vector database to query.
+        :param query: The query to search for.
+        :param params: The parameters of the query.
+        :returns: A QueryChunksResponse.
+        """
+        ...
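
And a companion sketch for the `VectorIO` methods: the `Chunk` constructor arguments and the `params` dict are assumptions, since neither is defined in this hunk.

```python
from llama_stack.apis.vector_io import Chunk  # assumed import path


async def index_and_search(vector_io):
    await vector_io.insert_chunks(
        vector_db_id="my-docs",
        chunks=[Chunk(content="Llama Stack ships a vector-io API.", metadata={"source": "readme"})],
        ttl_seconds=3600,
    )
    return await vector_io.query_chunks(
        vector_db_id="my-docs",
        query="what does llama stack ship?",
        params={"max_chunks": 3},  # provider-interpreted parameters
    )
```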
diff --git a/llama_stack/cli/llama.py b/llama_stack/cli/llama.py
index 8ff580029..433b311e7 100644
--- a/llama_stack/cli/llama.py
+++ b/llama_stack/cli/llama.py
@@ -38,7 +38,10 @@ class LlamaCLIParser:
         print_subcommand_description(self.parser, subparsers)
 
     def parse_args(self) -> argparse.Namespace:
-        return self.parser.parse_args()
+        args = self.parser.parse_args()
+        if not isinstance(args, argparse.Namespace):
+            raise TypeError(f"Expected argparse.Namespace, got {type(args)}")
+        return args
 
     def run(self, args: argparse.Namespace) -> None:
         args.func(args)
diff --git a/llama_stack/cli/stack/_build.py b/llama_stack/cli/stack/_build.py
index ae4a39ce2..4c6f82d1b 100644
--- a/llama_stack/cli/stack/_build.py
+++ b/llama_stack/cli/stack/_build.py
@@ -12,6 +12,7 @@ import shutil
 import sys
 import textwrap
 from functools import lru_cache
+from importlib.abc import Traversable
 from pathlib import Path
 
 import yaml
@@ -36,7 +37,8 @@ from llama_stack.distribution.datatypes import (
 )
 from llama_stack.distribution.distribution import get_provider_registry
 from llama_stack.distribution.resolver import InvalidProviderError
-from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
+from llama_stack.distribution.stack import replace_env_vars
+from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR, EXTERNAL_PROVIDERS_DIR
 from llama_stack.distribution.utils.dynamic import instantiate_class_type
 from llama_stack.distribution.utils.exec import formulate_run_args, run_command
 from llama_stack.distribution.utils.image_types import LlamaStackImageType
@@ -202,7 +204,11 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
     else:
         with open(args.config) as f:
             try:
-                build_config = BuildConfig(**yaml.safe_load(f))
+                contents = yaml.safe_load(f)
+                contents = replace_env_vars(contents)
+                build_config = BuildConfig(**contents)
+                if args.image_type:
+                    build_config.image_type = args.image_type
             except Exception as e:
                 cprint(
                     f"Could not parse config file {args.config}: {e}",
@@ -245,11 +251,12 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
         sys.exit(1)
 
     if args.run:
-        run_config = Path(run_config)
         config_dict = yaml.safe_load(run_config.read_text())
         config = parse_and_maybe_upgrade_config(config_dict)
+        if not os.path.exists(config.external_providers_dir):
+            os.makedirs(config.external_providers_dir, exist_ok=True)
         run_args = formulate_run_args(args.image_type, args.image_name, config, args.template)
-        run_args.extend([run_config, str(os.getenv("LLAMA_STACK_PORT", 8321))])
+        run_args.extend([str(os.getenv("LLAMA_STACK_PORT", 8321)), "--config", run_config])
         run_command(run_args)
 
 
@@ -257,7 +264,7 @@ def _generate_run_config(
     build_config: BuildConfig,
     build_dir: Path,
     image_name: str,
-) -> str:
+) -> Path:
     """
     Generate a run.yaml template file for user to edit from a build.yaml file
     """
@@ -267,7 +274,9 @@ def _generate_run_config(
         image_name=image_name,
         apis=apis,
         providers={},
-        external_providers_dir=build_config.external_providers_dir if build_config.external_providers_dir else None,
+        external_providers_dir=build_config.external_providers_dir
+        if build_config.external_providers_dir
+        else EXTERNAL_PROVIDERS_DIR,
     )
     # build providers dict
     provider_registry = get_provider_registry(build_config)
@@ -334,7 +343,7 @@ def _run_stack_build_command_from_build_config(
     image_name: str | None = None,
     template_name: str | None = None,
     config_path: str | None = None,
-) -> str:
+) -> Path | Traversable:
     image_name = image_name or build_config.image_name
     if build_config.image_type == LlamaStackImageType.CONTAINER.value:
         if template_name:
diff --git a/llama_stack/cli/stack/build.py b/llama_stack/cli/stack/build.py
index 93e7d9b22..2c402beeb 100644
--- a/llama_stack/cli/stack/build.py
+++ b/llama_stack/cli/stack/build.py
@@ -49,7 +49,7 @@ class StackBuild(Subcommand):
             type=str,
             help="Image Type to use for the build. If not specified, will use the image type from the template config.",
             choices=[e.value for e in ImageType],
-            default=ImageType.CONDA.value,
+            default=None,  # no default so we can detect if a user specified --image-type and override image_type in the config
         )
 
         self.parser.add_argument(
diff --git a/llama_stack/cli/stack/list_providers.py b/llama_stack/cli/stack/list_providers.py
index bfe11aa2c..deebd937b 100644
--- a/llama_stack/cli/stack/list_providers.py
+++ b/llama_stack/cli/stack/list_providers.py
@@ -46,7 +46,7 @@ class StackListProviders(Subcommand):
         else:
             providers = [(k.value, prov) for k, prov in all_providers.items()]
 
-        providers = [p for api, p in providers if api in self.providable_apis]
+        providers = [(api, p) for api, p in providers if api in self.providable_apis]
 
         # eventually, this should query a registry at llama.meta.com/llamastack/distributions
         headers = [
@@ -57,7 +57,7 @@ class StackListProviders(Subcommand):
 
         rows = []
 
-        specs = [spec for p in providers for spec in p.values()]
+        specs = [spec for api, p in providers for spec in p.values()]
         for spec in specs:
             if spec.is_sample:
                 continue
@@ -65,7 +65,7 @@ class StackListProviders(Subcommand):
                 [
                     spec.api.value,
                     spec.provider_type,
-                    ",".join(spec.pip_packages),
+                    ",".join(spec.pip_packages) if hasattr(spec, "pip_packages") else "",
                 ]
             )
         print_table(
diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py
index 2eee20883..4a44e0366 100644
--- a/llama_stack/cli/stack/run.py
+++ b/llama_stack/cli/stack/run.py
@@ -33,7 +33,8 @@ class StackRun(Subcommand):
         self.parser.add_argument(
             "config",
             type=str,
-            help="Path to config file to use for the run",
+            nargs="?",  # Make it optional
+            help="Path to config file to use for the run. Required for venv and conda environments.",
         )
         self.parser.add_argument(
             "--port",
@@ -47,28 +48,12 @@ class StackRun(Subcommand):
             default=os.environ.get("CONDA_DEFAULT_ENV"),
             help="Name of the image to run. Defaults to the current environment",
         )
-        self.parser.add_argument(
-            "--disable-ipv6",
-            action="store_true",
-            help="Disable IPv6 support",
-            default=False,
-        )
         self.parser.add_argument(
             "--env",
             action="append",
             help="Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.",
             metavar="KEY=VALUE",
         )
-        self.parser.add_argument(
-            "--tls-keyfile",
-            type=str,
-            help="Path to TLS key file for HTTPS",
-        )
-        self.parser.add_argument(
-            "--tls-certfile",
-            type=str,
-            help="Path to TLS certificate file for HTTPS",
-        )
         self.parser.add_argument(
             "--image-type",
             type=str,
@@ -98,44 +83,55 @@ class StackRun(Subcommand):
         from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
         from llama_stack.distribution.utils.exec import formulate_run_args, run_command
 
-        config_file = Path(args.config)
-        has_yaml_suffix = args.config.endswith(".yaml")
-        template_name = None
-
-        if not config_file.exists() and not has_yaml_suffix:
-            # check if this is a template
-            config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
-            if config_file.exists():
-                template_name = args.config
-
-        if not config_file.exists() and not has_yaml_suffix:
-            # check if it's a build config saved to ~/.llama dir
-            config_file = Path(DISTRIBS_BASE_DIR / f"llamastack-{args.config}" / f"{args.config}-run.yaml")
-
-        if not config_file.exists():
-            self.parser.error(
-                f"File {str(config_file)} does not exist.\n\nPlease run `llama stack build` to generate (and optionally edit) a run.yaml file"
-            )
-
-        if not config_file.is_file():
-            self.parser.error(
-                f"Config file must be a valid file path, '{config_file}' is not a file: type={type(config_file)}"
-            )
-
-        logger.info(f"Using run configuration: {config_file}")
-
-        try:
-            config_dict = yaml.safe_load(config_file.read_text())
-        except yaml.parser.ParserError as e:
-            self.parser.error(f"failed to load config file '{config_file}':\n {e}")
-
-        try:
-            config = parse_and_maybe_upgrade_config(config_dict)
-        except AttributeError as e:
-            self.parser.error(f"failed to parse config file '{config_file}':\n {e}")
-
         image_type, image_name = self._get_image_type_and_name(args)
 
+        # Check if config is required based on image type
+        if (image_type in [ImageType.CONDA.value, ImageType.VENV.value]) and not args.config:
+            self.parser.error("Config file is required for venv and conda environments")
+
+        if args.config:
+            config_file = Path(args.config)
+            has_yaml_suffix = args.config.endswith(".yaml")
+            template_name = None
+
+            if not config_file.exists() and not has_yaml_suffix:
+                # check if this is a template
+                config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
+                if config_file.exists():
+                    template_name = args.config
+
+            if not config_file.exists() and not has_yaml_suffix:
+                # check if it's a build config saved to ~/.llama dir
+                config_file = Path(DISTRIBS_BASE_DIR / f"llamastack-{args.config}" / f"{args.config}-run.yaml")
+
+            if not config_file.exists():
+                self.parser.error(
+                    f"File {str(config_file)} does not exist.\n\nPlease run `llama stack build` to generate (and optionally edit) a run.yaml file"
+                )
+
+            if not config_file.is_file():
+                self.parser.error(
+                    f"Config file must be a valid file path, '{config_file}' is not a file: type={type(config_file)}"
+                )
+
+            logger.info(f"Using run configuration: {config_file}")
+
+            try:
+                config_dict = yaml.safe_load(config_file.read_text())
+            except yaml.parser.ParserError as e:
+                self.parser.error(f"failed to load config file '{config_file}':\n {e}")
+
+            try:
+                config = parse_and_maybe_upgrade_config(config_dict)
+                if not os.path.exists(str(config.external_providers_dir)):
+                    os.makedirs(str(config.external_providers_dir), exist_ok=True)
+            except AttributeError as e:
+                self.parser.error(f"failed to parse config file '{config_file}':\n {e}")
+        else:
+            config = None
+            config_file = None
+            template_name = None
+
         # If neither image type nor image name is provided, assume the server should be run directly
         # using the current environment packages.
         if not image_type and not image_name:
@@ -157,9 +153,10 @@ class StackRun(Subcommand):
         else:
             run_args = formulate_run_args(image_type, image_name, config, template_name)
 
-            run_args.extend([str(config_file), str(args.port)])
-            if args.disable_ipv6:
-                run_args.append("--disable-ipv6")
+            run_args.extend([str(args.port)])
+
+            if config_file:
+                run_args.extend(["--config", str(config_file)])
 
             if args.env:
                 for env_var in args.env:
@@ -172,6 +169,4 @@ class StackRun(Subcommand):
                         return
                     run_args.extend(["--env", f"{key}={value}"])
 
-            if args.tls_keyfile and args.tls_certfile:
-                run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile])
             run_command(run_args)
diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh
index ad316d45e..c128729e1 100755
--- a/llama_stack/distribution/build_container.sh
+++ b/llama_stack/distribution/build_container.sh
@@ -154,6 +154,12 @@ get_python_cmd() {
     fi
 }
 
+# Add other required item commands generic to all containers
+add_to_container << EOF
+# Allows running as non-root user
+RUN mkdir -p /.llama/providers.d /.cache
+EOF
+
 if [ -n "$run_config" ]; then
   # Copy the run config to the build context since it's an absolute path
   cp "$run_config" "$BUILD_CONTEXT_DIR/run.yaml"
@@ -166,17 +172,19 @@ EOF
   # and update the configuration to reference the new container path
   python_cmd=$(get_python_cmd)
   external_providers_dir=$($python_cmd -c "import yaml; config = yaml.safe_load(open('$run_config')); print(config.get('external_providers_dir') or '')")
-  if [ -n "$external_providers_dir" ]; then
+  external_providers_dir=$(eval echo "$external_providers_dir")
+  if [ -n "$external_providers_dir" ] && [ -d "$external_providers_dir" ]; then
     echo "Copying external providers directory: $external_providers_dir"
+    cp -r "$external_providers_dir" "$BUILD_CONTEXT_DIR/providers.d"
     add_to_container << EOF
-COPY $external_providers_dir /app/providers.d
+COPY providers.d /.llama/providers.d
 EOF
-    # Edit the run.yaml file to change the external_providers_dir to /app/providers.d
+    # Edit the run.yaml file to change the external_providers_dir to /.llama/providers.d
     if [ "$(uname)" = "Darwin" ]; then
-      sed -i.bak -e 's|external_providers_dir:.*|external_providers_dir: /app/providers.d|' "$BUILD_CONTEXT_DIR/run.yaml"
+      sed -i.bak -e 's|external_providers_dir:.*|external_providers_dir: /.llama/providers.d|' "$BUILD_CONTEXT_DIR/run.yaml"
       rm -f "$BUILD_CONTEXT_DIR/run.yaml.bak"
     else
-      sed -i 's|external_providers_dir:.*|external_providers_dir: /app/providers.d|' "$BUILD_CONTEXT_DIR/run.yaml"
+      sed -i 's|external_providers_dir:.*|external_providers_dir: /.llama/providers.d|' "$BUILD_CONTEXT_DIR/run.yaml"
     fi
   fi
 fi
@@ -255,9 +263,6 @@ fi
 # Add other require item commands genearic to all containers
 add_to_container << EOF
 
-# Allows running as non-root user
-RUN mkdir -p /.llama /.cache
-
 RUN chmod -R g+rw /app /.llama /.cache
 EOF
 
diff --git a/llama_stack/distribution/configure.py b/llama_stack/distribution/configure.py
index 76167258a..e58ea0338 100644
--- a/llama_stack/distribution/configure.py
+++ b/llama_stack/distribution/configure.py
@@ -17,6 +17,7 @@ from llama_stack.distribution.distribution import (
     builtin_automatically_routed_apis,
     get_provider_registry,
 )
+from llama_stack.distribution.utils.config_dirs import EXTERNAL_PROVIDERS_DIR
 from llama_stack.distribution.utils.dynamic import instantiate_class_type
 from llama_stack.distribution.utils.prompt_for_config import prompt_for_config
 from llama_stack.providers.datatypes import Api, ProviderSpec
@@ -73,11 +74,7 @@ def configure_api_providers(config: StackRunConfig, build_spec: DistributionSpec
 
         existing_providers = config.providers.get(api_str, [])
         if existing_providers:
-            logger.info(
-                f"Re-configuring existing providers for API `{api_str}`...",
-                "green",
-                attrs=["bold"],
-            )
+            logger.info(f"Re-configuring existing providers for API `{api_str}`...")
             updated_providers = []
             for p in existing_providers:
                 logger.info(f"> Configuring provider `({p.provider_type})`")
@@ -91,7 +88,7 @@ def configure_api_providers(config: StackRunConfig, build_spec: DistributionSpec
             if not plist:
                 raise ValueError(f"No provider configured for API {api_str}?")
 
-            logger.info(f"Configuring API `{api_str}`...", "green", attrs=["bold"])
+            logger.info(f"Configuring API `{api_str}`...")
             updated_providers = []
             for i, provider_type in enumerate(plist):
                 if i >= 1:
@@ -174,4 +171,7 @@ def parse_and_maybe_upgrade_config(config_dict: dict[str, Any]) -> StackRunConfi
 
     config_dict["version"] = LLAMA_STACK_RUN_CONFIG_VERSION
 
+    if not config_dict.get("external_providers_dir", None):
+        config_dict["external_providers_dir"] = EXTERNAL_PROVIDERS_DIR
+
     return StackRunConfig(**config_dict)
diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py
index c127136c9..446a88ca0 100644
--- a/llama_stack/distribution/datatypes.py
+++ b/llama_stack/distribution/datatypes.py
@@ -5,9 +5,10 @@
 # the root directory of this source tree.
 
 from enum import Enum
+from pathlib import Path
 from typing import Annotated, Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 
 from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput
 from llama_stack.apis.datasetio import DatasetIO
@@ -249,10 +250,18 @@ class ServerConfig(BaseModel):
         default=None,
         description="Path to TLS key file for HTTPS",
     )
+    tls_cafile: str | None = Field(
+        default=None,
+        description="Path to TLS CA file for HTTPS with mutual TLS authentication",
+    )
     auth: AuthenticationConfig | None = Field(
         default=None,
         description="Authentication configuration for the server",
     )
+    host: str | None = Field(
+        default=None,
+        description="The host the server should listen on",
+    )
 
 
 class StackRunConfig(BaseModel):
@@ -304,11 +313,20 @@ a default SQLite store will be used.""",
         description="Configuration for the HTTP(S) server",
     )
 
-    external_providers_dir: str | None = Field(
+    external_providers_dir: Path | None = Field(
         default=None,
         description="Path to directory containing external provider implementations. The providers code and dependencies must be installed on the system.",
     )
 
+    @field_validator("external_providers_dir")
+    @classmethod
+    def validate_external_providers_dir(cls, v):
+        if v is None:
+            return None
+        if isinstance(v, str):
+            return Path(v)
+        return v
+
 
 class BuildConfig(BaseModel):
     version: str = LLAMA_STACK_BUILD_CONFIG_VERSION
@@ -322,8 +340,17 @@ class BuildConfig(BaseModel):
         default=None,
         description="Name of the distribution to build",
     )
-    external_providers_dir: str | None = Field(
+    external_providers_dir: Path | None = Field(
         default=None,
         description="Path to directory containing external provider implementations. The providers packages will be resolved from this directory. "
         "pip_packages MUST contain the provider package name.",
     )
+
+    @field_validator("external_providers_dir")
+    @classmethod
+    def validate_external_providers_dir(cls, v):
+        if v is None:
+            return None
+        if isinstance(v, str):
+            return Path(v)
+        return v
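
The two validators above only normalize strings to `pathlib.Path`. A standalone sketch of the same pattern, using a hypothetical model name rather than the real config classes:

```python
from pathlib import Path

from pydantic import BaseModel, field_validator


class HasProvidersDir(BaseModel):  # stand-in for StackRunConfig/BuildConfig
    external_providers_dir: Path | None = None

    @field_validator("external_providers_dir")
    @classmethod
    def validate_external_providers_dir(cls, v):
        if v is None:
            return None
        return Path(v) if isinstance(v, str) else v


cfg = HasProvidersDir(external_providers_dir="~/.llama/providers.d")
print(type(cfg.external_providers_dir))  # a pathlib.Path subclass
# Note: "~" is expanded later, e.g. via os.path.expanduser in get_provider_registry.
```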
diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py
index 07a91478a..b860d15ab 100644
--- a/llama_stack/distribution/distribution.py
+++ b/llama_stack/distribution/distribution.py
@@ -145,7 +145,7 @@ def get_provider_registry(
 
     # Check if config has the external_providers_dir attribute
     if config and hasattr(config, "external_providers_dir") and config.external_providers_dir:
-        external_providers_dir = os.path.abspath(config.external_providers_dir)
+        external_providers_dir = os.path.abspath(os.path.expanduser(config.external_providers_dir))
         if not os.path.exists(external_providers_dir):
             raise FileNotFoundError(f"External providers directory not found: {external_providers_dir}")
         logger.info(f"Loading external providers from {external_providers_dir}")
diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py
index b2d16d74c..34e041cbe 100644
--- a/llama_stack/distribution/library_client.py
+++ b/llama_stack/distribution/library_client.py
@@ -30,7 +30,7 @@ from termcolor import cprint
 
 from llama_stack.distribution.build import print_pip_install_help
 from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
-from llama_stack.distribution.datatypes import Api
+from llama_stack.distribution.datatypes import Api, BuildConfig, DistributionSpec
 from llama_stack.distribution.request_headers import (
     PROVIDER_DATA_VAR,
     request_provider_data_context,
@@ -216,7 +216,19 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
                 "yellow",
             )
             if self.config_path_or_template_name.endswith(".yaml"):
-                print_pip_install_help(self.config.providers)
+                # Convert Provider objects to their types
+                provider_types: dict[str, str | list[str]] = {}
+                for api, providers in self.config.providers.items():
+                    types = [p.provider_type for p in providers]
+                    # Convert single-item lists to strings
+                    provider_types[api] = types[0] if len(types) == 1 else types
+                build_config = BuildConfig(
+                    distribution_spec=DistributionSpec(
+                        providers=provider_types,
+                    ),
+                    external_providers_dir=self.config.external_providers_dir,
+                )
+                print_pip_install_help(build_config)
             else:
                 prefix = "!" if in_notebook() else ""
                 cprint(
diff --git a/llama_stack/distribution/providers.py b/llama_stack/distribution/providers.py
index 157fd1e0e..29b7109dd 100644
--- a/llama_stack/distribution/providers.py
+++ b/llama_stack/distribution/providers.py
@@ -99,7 +99,7 @@ class ProviderImpl(Providers):
             try:
                 health = await asyncio.wait_for(impl.health(), timeout=timeout)
                 return api_name, health
-            except asyncio.TimeoutError:
+            except (asyncio.TimeoutError, TimeoutError):
                 return (
                     api_name,
                     HealthResponse(
diff --git a/llama_stack/distribution/request_headers.py b/llama_stack/distribution/request_headers.py
index bc15776ec..b03d2dee8 100644
--- a/llama_stack/distribution/request_headers.py
+++ b/llama_stack/distribution/request_headers.py
@@ -44,7 +44,8 @@ class RequestProviderDataContext(AbstractContextManager):
 class NeedsRequestProviderData:
     def get_request_provider_data(self) -> Any:
         spec = self.__provider_spec__
-        assert spec, f"Provider spec not set on {self.__class__}"
+        if not spec:
+            raise ValueError(f"Provider spec not set on {self.__class__}")
 
         provider_type = spec.provider_type
         validator_class = spec.provider_data_validator
diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py
index 37588ea64..257c495c3 100644
--- a/llama_stack/distribution/resolver.py
+++ b/llama_stack/distribution/resolver.py
@@ -13,7 +13,7 @@ from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.eval import Eval
 from llama_stack.apis.files import Files
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import Inference, InferenceProvider
 from llama_stack.apis.inspect import Inspect
 from llama_stack.apis.models import Models
 from llama_stack.apis.post_training import PostTraining
@@ -83,6 +83,13 @@ def api_protocol_map() -> dict[Api, Any]:
     }
 
 
+def api_protocol_map_for_compliance_check() -> dict[Api, Any]:
+    return {
+        **api_protocol_map(),
+        Api.inference: InferenceProvider,
+    }
+
+
 def additional_protocols_map() -> dict[Api, Any]:
     return {
         Api.inference: (ModelsProtocolPrivate, Models, Api.models),
@@ -302,9 +309,6 @@ async def instantiate_provider(
     inner_impls: dict[str, Any],
     dist_registry: DistributionRegistry,
 ):
-    protocols = api_protocol_map()
-    additional_protocols = additional_protocols_map()
-
     provider_spec = provider.spec
     if not hasattr(provider_spec, "module"):
         raise AttributeError(f"ProviderSpec of type {type(provider_spec)} does not have a 'module' attribute")
@@ -342,6 +346,8 @@ async def instantiate_provider(
     impl.__provider_spec__ = provider_spec
     impl.__provider_config__ = config
 
+    protocols = api_protocol_map_for_compliance_check()
+    additional_protocols = additional_protocols_map()
     # TODO: check compliance for special tool groups
     # the impl should be for Api.tool_runtime, the name should be the special tool group, the protocol should be the special tool group protocol
     check_protocol_compliance(impl, protocols[provider_spec.api])
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index 737a384d8..371d34904 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -573,6 +573,12 @@ class InferenceRouter(Inference):
             for tool in tools:
                 TypeAdapter(OpenAIChatCompletionToolParam).validate_python(tool)
 
+        # Some providers make tool calls even when tool_choice is "none"
+        # so just clear them both out to avoid unexpected tool calls
+        if tool_choice == "none" and tools is not None:
+            tool_choice = None
+            tools = None
+
         params = dict(
             model=model_obj.identifier,
             messages=messages,
@@ -600,7 +606,19 @@ class InferenceRouter(Inference):
         )
 
         provider = self.routing_table.get_provider_impl(model_obj.identifier)
-        return await provider.openai_chat_completion(**params)
+        if stream:
+            return await provider.openai_chat_completion(**params)
+        else:
+            return await self._nonstream_openai_chat_completion(provider, params)
+
+    async def _nonstream_openai_chat_completion(self, provider: Inference, params: dict) -> OpenAIChatCompletion:
+        response = await provider.openai_chat_completion(**params)
+        for choice in response.choices:
+            # some providers return an empty list for no tool calls in non-streaming responses
+            # but the OpenAI API returns None. So, set tool_calls to None if it's empty
+            if choice.message and choice.message.tool_calls is not None and len(choice.message.tool_calls) == 0:
+                choice.message.tool_calls = None
+        return response
 
     async def health(self) -> dict[str, HealthResponse]:
         health_statuses = {}
@@ -612,7 +630,7 @@ class InferenceRouter(Inference):
                     continue
                 health = await asyncio.wait_for(impl.health(), timeout=timeout)
                 health_statuses[provider_id] = health
-            except asyncio.TimeoutError:
+            except (asyncio.TimeoutError, TimeoutError):
                 health_statuses[provider_id] = HealthResponse(
                     status=HealthStatus.ERROR,
                     message=f"Health check timed out after {timeout} seconds",
diff --git a/llama_stack/distribution/server/auth.py b/llama_stack/distribution/server/auth.py
index 429232ece..83436c51f 100644
--- a/llama_stack/distribution/server/auth.py
+++ b/llama_stack/distribution/server/auth.py
@@ -93,7 +93,7 @@ class AuthenticationMiddleware:
 
             # Validate token and get access attributes
             try:
-                access_attributes = await self.auth_provider.validate_token(token, scope)
+                validation_result = await self.auth_provider.validate_token(token, scope)
             except httpx.TimeoutException:
                 logger.exception("Authentication request timed out")
                 return await self._send_auth_error(send, "Authentication service timeout")
@@ -105,17 +105,20 @@ class AuthenticationMiddleware:
                 return await self._send_auth_error(send, "Authentication service error")
 
             # Store attributes in request scope for access control
-            if access_attributes:
-                user_attributes = access_attributes.model_dump(exclude_none=True)
+            if validation_result.access_attributes:
+                user_attributes = validation_result.access_attributes.model_dump(exclude_none=True)
             else:
                 logger.warning("No access attributes, setting namespace to token by default")
                 user_attributes = {
-                    "namespaces": [token],
+                    "roles": [token],
                 }
 
             # Store attributes in request scope
             scope["user_attributes"] = user_attributes
-            logger.debug(f"Authentication successful: {len(scope['user_attributes'])} attributes")
+            scope["principal"] = validation_result.principal
+            logger.debug(
+                f"Authentication successful: {validation_result.principal} with {len(scope['user_attributes'])} attributes"
+            )
 
         return await self.app(scope, receive, send)
 
diff --git a/llama_stack/distribution/server/auth_providers.py b/llama_stack/distribution/server/auth_providers.py
index 1b19f8923..4065a65f3 100644
--- a/llama_stack/distribution/server/auth_providers.py
+++ b/llama_stack/distribution/server/auth_providers.py
@@ -5,12 +5,14 @@
 # the root directory of this source tree.
 
 import json
+import time
 from abc import ABC, abstractmethod
 from enum import Enum
 from urllib.parse import parse_qs
 
 import httpx
-from pydantic import BaseModel, Field
+from jose import jwt
+from pydantic import BaseModel, Field, field_validator
 
 from llama_stack.distribution.datatypes import AccessAttributes
 from llama_stack.log import get_logger
@@ -18,9 +20,11 @@ from llama_stack.log import get_logger
 logger = get_logger(name=__name__, category="auth")
 
 
-class AuthResponse(BaseModel):
-    """The format of the authentication response from the auth endpoint."""
-
+class TokenValidationResult(BaseModel):
+    principal: str | None = Field(
+        default=None,
+        description="The principal (username or persistent identifier) of the authenticated user",
+    )
     access_attributes: AccessAttributes | None = Field(
         default=None,
         description="""
@@ -43,6 +47,10 @@ class AuthResponse(BaseModel):
         """,
     )
 
+
+class AuthResponse(TokenValidationResult):
+    """The format of the authentication response from the auth endpoint."""
+
     message: str | None = Field(
         default=None, description="Optional message providing additional context about the authentication result."
     )
@@ -69,6 +77,7 @@ class AuthProviderType(str, Enum):
 
     KUBERNETES = "kubernetes"
     CUSTOM = "custom"
+    OAUTH2_TOKEN = "oauth2_token"
 
 
 class AuthProviderConfig(BaseModel):
@@ -82,7 +91,7 @@ class AuthProvider(ABC):
     """Abstract base class for authentication providers."""
 
     @abstractmethod
-    async def validate_token(self, token: str, scope: dict | None = None) -> AccessAttributes | None:
+    async def validate_token(self, token: str, scope: dict | None = None) -> TokenValidationResult:
         """Validate a token and return access attributes."""
         pass
 
@@ -92,12 +101,16 @@ class AuthProvider(ABC):
         pass
 
 
+class KubernetesAuthProviderConfig(BaseModel):
+    api_server_url: str
+    ca_cert_path: str | None = None
+
+
 class KubernetesAuthProvider(AuthProvider):
     """Kubernetes authentication provider that validates tokens against the Kubernetes API server."""
 
-    def __init__(self, config: dict[str, str]):
-        self.api_server_url = config["api_server_url"]
-        self.ca_cert_path = config.get("ca_cert_path")
+    def __init__(self, config: KubernetesAuthProviderConfig):
+        self.config = config
         self._client = None
 
     async def _get_client(self):
@@ -110,16 +123,16 @@ class KubernetesAuthProvider(AuthProvider):
 
             # Configure the client
             configuration = client.Configuration()
-            configuration.host = self.api_server_url
-            if self.ca_cert_path:
-                configuration.ssl_ca_cert = self.ca_cert_path
-            configuration.verify_ssl = bool(self.ca_cert_path)
+            configuration.host = self.config.api_server_url
+            if self.config.ca_cert_path:
+                configuration.ssl_ca_cert = self.config.ca_cert_path
+            configuration.verify_ssl = bool(self.config.ca_cert_path)
 
             # Create API client
             self._client = ApiClient(configuration)
         return self._client
 
-    async def validate_token(self, token: str, scope: dict | None = None) -> AccessAttributes | None:
+    async def validate_token(self, token: str, scope: dict | None = None) -> TokenValidationResult:
         """Validate a Kubernetes token and return access attributes."""
         try:
             client = await self._get_client()
@@ -146,9 +159,12 @@ class KubernetesAuthProvider(AuthProvider):
             username = payload.get("sub", "")
             groups = payload.get("groups", [])
 
-            return AccessAttributes(
-                roles=[username],  # Use username as a role
-                teams=groups,  # Use Kubernetes groups as teams
+            return TokenValidationResult(
+                principal=username,
+                access_attributes=AccessAttributes(
+                    roles=[username],  # Use username as a role
+                    teams=groups,  # Use Kubernetes groups as teams
+                ),
             )
 
         except Exception as e:
@@ -162,18 +178,125 @@ class KubernetesAuthProvider(AuthProvider):
             self._client = None
 
 
+def get_attributes_from_claims(claims: dict[str, str], mapping: dict[str, str]) -> AccessAttributes:
+    attributes = AccessAttributes()
+    for claim_key, attribute_key in mapping.items():
+        if claim_key not in claims or not hasattr(attributes, attribute_key):
+            continue
+        claim = claims[claim_key]
+        if isinstance(claim, list):
+            values = claim
+        else:
+            values = claim.split()
+
+        current = getattr(attributes, attribute_key)
+        if current:
+            current.extend(values)
+        else:
+            setattr(attributes, attribute_key, values)
+    return attributes
+
+
+class OAuth2TokenAuthProviderConfig(BaseModel):
+    # The JWKS URI for collecting public keys
+    jwks_uri: str
+    cache_ttl: int = 3600
+    audience: str = "llama-stack"
+    claims_mapping: dict[str, str] = Field(
+        default_factory=lambda: {
+            "sub": "roles",
+            "username": "roles",
+            "groups": "teams",
+            "team": "teams",
+            "project": "projects",
+            "tenant": "namespaces",
+            "namespace": "namespaces",
+        },
+    )
+
+    @classmethod
+    @field_validator("claims_mapping")
+    @classmethod
+    def validate_claims_mapping(cls, v):
+        for key, value in v.items():
+            if not value:
+                raise ValueError(f"claims_mapping value cannot be empty: {key}")
+            if value not in AccessAttributes.model_fields:
+                raise ValueError(f"claims_mapping value is not a valid attribute: {value}")
+        return v
+
+
+class OAuth2TokenAuthProvider(AuthProvider):
+    """
+    JWT token authentication provider that validates a JWT token and extracts access attributes.
+
+    This should be the standard authentication provider for most use cases.
+    """
+
+    def __init__(self, config: OAuth2TokenAuthProviderConfig):
+        self.config = config
+        self._jwks_at: float = 0.0
+        self._jwks: dict[str, str] = {}
+
+    async def validate_token(self, token: str, scope: dict | None = None) -> TokenValidationResult:
+        """Validate a token using the JWT token."""
+        await self._refresh_jwks()
+
+        try:
+            header = jwt.get_unverified_header(token)
+            kid = header["kid"]
+            if kid not in self._jwks:
+                raise ValueError(f"Unknown key ID: {kid}")
+            key_data = self._jwks[kid]
+            algorithm = header.get("alg", "RS256")
+            claims = jwt.decode(
+                token,
+                key_data,
+                algorithms=[algorithm],
+                audience=self.config.audience,
+                options={"verify_exp": True},
+            )
+        except Exception as exc:
+            raise ValueError(f"Invalid JWT token: {token}") from exc
+
+        # There are other standard claims, the most relevant of which is `scope`.
+        # We should incorporate these into the access attributes.
+        principal = claims["sub"]
+        access_attributes = get_attributes_from_claims(claims, self.config.claims_mapping)
+        return TokenValidationResult(
+            principal=principal,
+            access_attributes=access_attributes,
+        )
+
+    async def close(self):
+        """Close the HTTP client."""
+
+    async def _refresh_jwks(self) -> None:
+        if time.time() - self._jwks_at > self.config.cache_ttl:
+            async with httpx.AsyncClient() as client:
+                res = await client.get(self.config.jwks_uri, timeout=5)
+                res.raise_for_status()
+                jwks_data = res.json()["keys"]
+                self._jwks = {}
+                for k in jwks_data:
+                    kid = k["kid"]
+                    # Store the entire key object as it may be needed for different algorithms
+                    self._jwks[kid] = k
+                self._jwks_at = time.time()
+
+
+class CustomAuthProviderConfig(BaseModel):
+    endpoint: str
+
+
 class CustomAuthProvider(AuthProvider):
     """Custom authentication provider that uses an external endpoint."""
 
-    def __init__(self, config: dict[str, str]):
-        self.endpoint = config["endpoint"]
+    def __init__(self, config: CustomAuthProviderConfig):
+        self.config = config
         self._client = None
 
-    async def validate_token(self, token: str, scope: dict | None = None) -> AccessAttributes | None:
+    async def validate_token(self, token: str, scope: dict | None = None) -> TokenValidationResult:
         """Validate a token using the custom authentication endpoint."""
-        if not self.endpoint:
-            raise ValueError("Authentication endpoint not configured")
-
         if scope is None:
             scope = {}
 
@@ -202,7 +325,7 @@ class CustomAuthProvider(AuthProvider):
         try:
             async with httpx.AsyncClient() as client:
                 response = await client.post(
-                    self.endpoint,
+                    self.config.endpoint,
                     json=auth_request.model_dump(),
                     timeout=10.0,  # Add a reasonable timeout
                 )
@@ -214,19 +337,7 @@ class CustomAuthProvider(AuthProvider):
                 try:
                     response_data = response.json()
                     auth_response = AuthResponse(**response_data)
-
-                    # Store attributes in request scope for access control
-                    if auth_response.access_attributes:
-                        return auth_response.access_attributes
-                    else:
-                        logger.warning("No access attributes, setting namespace to api_key by default")
-                        user_attributes = {
-                            "namespaces": [token],
-                        }
-
-                    scope["user_attributes"] = user_attributes
-                    logger.debug(f"Authentication successful: {len(user_attributes)} attributes")
-                    return auth_response.access_attributes
+                    return auth_response
                 except Exception as e:
                     logger.exception("Error parsing authentication response")
                     raise ValueError("Invalid authentication response format") from e
@@ -253,9 +364,11 @@ def create_auth_provider(config: AuthProviderConfig) -> AuthProvider:
     provider_type = config.provider_type.lower()
 
     if provider_type == "kubernetes":
-        return KubernetesAuthProvider(config.config)
+        return KubernetesAuthProvider(KubernetesAuthProviderConfig.model_validate(config.config))
     elif provider_type == "custom":
-        return CustomAuthProvider(config.config)
+        return CustomAuthProvider(CustomAuthProviderConfig.model_validate(config.config))
+    elif provider_type == "oauth2_token":
+        return OAuth2TokenAuthProvider(OAuth2TokenAuthProviderConfig.model_validate(config.config))
     else:
         supported_providers = ", ".join([t.value for t in AuthProviderType])
         raise ValueError(f"Unsupported auth provider type: {provider_type}. Supported types are: {supported_providers}")
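
To make the claims mapping concrete, a small worked example against the default `claims_mapping` shipped above; the claim values are invented, and the import path follows the patched file.

```python
from llama_stack.distribution.server.auth_providers import (
    OAuth2TokenAuthProviderConfig,
    get_attributes_from_claims,
)

cfg = OAuth2TokenAuthProviderConfig(jwks_uri="https://example.test/jwks")  # placeholder URI
claims = {
    "sub": "alice",
    "groups": ["ml-team", "platform"],
    "project": "observability demo",  # string claims are split on whitespace
}
attrs = get_attributes_from_claims(claims, cfg.claims_mapping)
# attrs.roles    -> ["alice"]
# attrs.teams    -> ["ml-team", "platform"]
# attrs.projects -> ["observability", "demo"]
```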
diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py
index 0c8c70306..15a8058ae 100644
--- a/llama_stack/distribution/server/server.py
+++ b/llama_stack/distribution/server/server.py
@@ -9,6 +9,7 @@ import asyncio
 import inspect
 import json
 import os
+import ssl
 import sys
 import traceback
 import warnings
@@ -17,6 +18,7 @@ from importlib.metadata import version as parse_version
 from pathlib import Path
 from typing import Annotated, Any
 
+import rich.pretty
 import yaml
 from fastapi import Body, FastAPI, HTTPException, Request
 from fastapi import Path as FastapiPath
@@ -114,7 +116,7 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationErro
         return HTTPException(status_code=400, detail=str(exc))
     elif isinstance(exc, PermissionError):
         return HTTPException(status_code=403, detail=f"Permission denied: {str(exc)}")
-    elif isinstance(exc, TimeoutError):
+    elif isinstance(exc, asyncio.TimeoutError | TimeoutError):
         return HTTPException(status_code=504, detail=f"Operation timed out: {str(exc)}")
     elif isinstance(exc, NotImplementedError):
         return HTTPException(status_code=501, detail=f"Not implemented: {str(exc)}")
@@ -139,7 +141,7 @@ async def shutdown(app):
                 await asyncio.wait_for(impl.shutdown(), timeout=5)
             else:
                 logger.warning("No shutdown method for %s", impl_name)
-        except asyncio.TimeoutError:
+        except (asyncio.TimeoutError, TimeoutError):
             logger.exception("Shutdown timeout for %s ", impl_name, exc_info=True)
         except (Exception, asyncio.CancelledError) as e:
             logger.exception("Failed to shutdown %s: %s", impl_name, {e})
@@ -186,11 +188,30 @@ async def sse_generator(event_gen_coroutine):
         )
 
 
+async def log_request_pre_validation(request: Request):
+    if request.method in ("POST", "PUT", "PATCH"):
+        try:
+            body_bytes = await request.body()
+            if body_bytes:
+                try:
+                    parsed_body = json.loads(body_bytes.decode())
+                    log_output = rich.pretty.pretty_repr(parsed_body)
+                except (json.JSONDecodeError, UnicodeDecodeError):
+                    log_output = repr(body_bytes)
+                logger.debug(f"Incoming raw request body for {request.method} {request.url.path}:\n{log_output}")
+            else:
+                logger.debug(f"Incoming {request.method} {request.url.path} request with empty body.")
+        except Exception as e:
+            logger.warning(f"Could not read or log request body for {request.method} {request.url.path}: {e}")
+
+
 def create_dynamic_typed_route(func: Any, method: str, route: str):
     async def endpoint(request: Request, **kwargs):
         # Get auth attributes from the request scope
         user_attributes = request.scope.get("user_attributes", {})
 
+        await log_request_pre_validation(request)
+
         # Use context manager with both provider data and auth attributes
         with request_provider_data_context(request.headers, user_attributes):
             is_streaming = is_streaming_request(func.__name__, request, **kwargs)
@@ -337,22 +358,11 @@ def main(args: argparse.Namespace | None = None):
         default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
         help="Port to listen on",
     )
-    parser.add_argument("--disable-ipv6", action="store_true", help="Whether to disable IPv6 support")
     parser.add_argument(
         "--env",
         action="append",
         help="Environment variables in KEY=value format. Can be specified multiple times.",
     )
-    parser.add_argument(
-        "--tls-keyfile",
-        help="Path to TLS key file for HTTPS",
-        required="--tls-certfile" in sys.argv,
-    )
-    parser.add_argument(
-        "--tls-certfile",
-        help="Path to TLS certificate file for HTTPS",
-        required="--tls-keyfile" in sys.argv,
-    )
 
     # Determine whether the server args are being passed by the "run" command, if this is the case
     # the args will be passed as a Namespace object to the main function, otherwise they will be
@@ -361,9 +371,9 @@ def main(args: argparse.Namespace | None = None):
         args = parser.parse_args()
 
     # Check for deprecated argument usage
     if "--yaml-config" in sys.argv:
         warnings.warn(
             "The '--yaml-config' argument is deprecated and will be removed in a future version. Use '--config' instead.",
             DeprecationWarning,
             stacklevel=2,
         )
@@ -381,7 +391,7 @@ def main(args: argparse.Namespace | None = None):
             raise ValueError(f"Template {args.template} does not exist")
         log_line = f"Using template {args.template} config file: {config_file}"
     else:
-        raise ValueError("Either --yaml-config or --template must be provided")
+        raise ValueError("Either --config or --template must be provided")
 
     logger_config = None
     with open(config_file) as fp:
@@ -486,21 +496,24 @@ def main(args: argparse.Namespace | None = None):
     port = args.port or config.server.port
 
     ssl_config = None
-    if args.tls_keyfile:
-        keyfile = args.tls_keyfile
-        certfile = args.tls_certfile
-    else:
-        keyfile = config.server.tls_keyfile
-        certfile = config.server.tls_certfile
+    keyfile = config.server.tls_keyfile
+    certfile = config.server.tls_certfile
 
     if keyfile and certfile:
         ssl_config = {
             "ssl_keyfile": keyfile,
             "ssl_certfile": certfile,
         }
-        logger.info(f"HTTPS enabled with certificates:\n  Key: {keyfile}\n  Cert: {certfile}")
+        if config.server.tls_cafile:
+            ssl_config["ssl_ca_certs"] = config.server.tls_cafile
+            ssl_config["ssl_cert_reqs"] = ssl.CERT_REQUIRED
+            logger.info(
+                f"HTTPS enabled with certificates:\n  Key: {keyfile}\n  Cert: {certfile}\n  CA: {config.server.tls_cafile}"
+            )
+        else:
+            logger.info(f"HTTPS enabled with certificates:\n  Key: {keyfile}\n  Cert: {certfile}")
 
-    listen_host = ["::", "0.0.0.0"] if not args.disable_ipv6 else "0.0.0.0"
+    listen_host = config.server.host or ["::", "0.0.0.0"]
     logger.info(f"Listening on {listen_host}:{port}")
 
     uvicorn_config = {
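
The TLS change above enables mutual TLS: when `tls_cafile` is set in the server config, uvicorn is started with `ssl_cert_reqs=ssl.CERT_REQUIRED`, so callers must present a certificate signed by that CA. A minimal client-side sketch (not part of this diff; file names and the request path are illustrative):

    import httpx

    # verify: CA bundle that signed the server certificate (tls_certfile)
    # cert:   client certificate/key signed by a CA in the server's tls_cafile bundle
    client = httpx.Client(
        base_url="https://localhost:8321",
        verify="ca.crt",
        cert=("client.crt", "client.key"),
    )
    print(client.get("/v1/health").status_code)
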
diff --git a/llama_stack/distribution/start_stack.sh b/llama_stack/distribution/start_stack.sh
index d3e13c7dc..996935a5e 100755
--- a/llama_stack/distribution/start_stack.sh
+++ b/llama_stack/distribution/start_stack.sh
@@ -29,7 +29,7 @@ error_handler() {
 trap 'error_handler ${LINENO}' ERR
 
 if [ $# -lt 3 ]; then
-  echo "Usage: $0     "
+  echo "Usage: $0    [--config ] [--env KEY=VALUE]..."
   exit 1
 fi
 
@@ -40,37 +40,51 @@ env_path_or_name="$1"
 container_image="localhost/$env_path_or_name"
 shift
 
-yaml_config="$1"
-shift
-
 port="$1"
 shift
 
 SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
 source "$SCRIPT_DIR/common.sh"
 
-# Initialize env_vars as an string
+# Initialize variables
+yaml_config=""
 env_vars=""
 other_args=""
-# Process environment variables from --env arguments
+
+# Process remaining arguments
 while [[ $# -gt 0 ]]; do
   case "$1" in
-  --env)
-
-    if [[ -n "$2" ]]; then
-      env_vars="$env_vars --env $2"
-      shift 2
-    else
-      echo -e "${RED}Error: --env requires a KEY=VALUE argument${NC}" >&2
-      exit 1
-    fi
-    ;;
-  *)
-    other_args="$other_args $1"
-    shift
-    ;;
+    --config)
+      if [[ -n "$2" ]]; then
+        yaml_config="$2"
+        shift 2
+      else
+        echo -e "${RED}Error: $1 requires a CONFIG argument${NC}" >&2
+        exit 1
+      fi
+      ;;
+    --env)
+      if [[ -n "$2" ]]; then
+        env_vars="$env_vars --env $2"
+        shift 2
+      else
+        echo -e "${RED}Error: --env requires a KEY=VALUE argument${NC}" >&2
+        exit 1
+      fi
+      ;;
+    *)
+      other_args="$other_args $1"
+      shift
+      ;;
   esac
 done
+
+# Check if yaml_config is required based on env_type
+if [[ "$env_type" == "venv" || "$env_type" == "conda" ]] && [ -z "$yaml_config" ]; then
+  echo -e "${RED}Error: --config is required for venv and conda environments${NC}" >&2
+  exit 1
+fi
+
 PYTHON_BINARY="python"
 case "$env_type" in
   "venv")
@@ -106,8 +120,14 @@ esac
 if [[ "$env_type" == "venv" || "$env_type" == "conda" ]]; then
     set -x
 
+    if [ -n "$yaml_config" ]; then
+        yaml_config_arg="--config $yaml_config"
+    else
+        yaml_config_arg=""
+    fi
+
     $PYTHON_BINARY -m llama_stack.distribution.server.server \
-    --yaml-config "$yaml_config" \
+    $yaml_config_arg \
     --port "$port" \
     $env_vars \
     $other_args
@@ -149,15 +169,26 @@ elif [[ "$env_type" == "container" ]]; then
         version_tag=$(curl -s $URL | jq -r '.info.version')
     fi
 
-    $CONTAINER_BINARY run $CONTAINER_OPTS -it \
+    # Build the command with optional yaml config
+    cmd="$CONTAINER_BINARY run $CONTAINER_OPTS -it \
     -p $port:$port \
     $env_vars \
-    -v "$yaml_config:/app/config.yaml" \
     $mounts \
     --env LLAMA_STACK_PORT=$port \
     --entrypoint python \
     $container_image:$version_tag \
-    -m llama_stack.distribution.server.server \
-    --yaml-config /app/config.yaml \
-    $other_args
+    -m llama_stack.distribution.server.server"
+
+    # Add yaml config if provided, otherwise use default
+    if [ -n "$yaml_config" ]; then
+        cmd="$cmd -v $yaml_config:/app/run.yaml --config /app/run.yaml"
+    else
+        cmd="$cmd --config /app/run.yaml"
+    fi
+
+    # Add any other args
+    cmd="$cmd $other_args"
+
+    # Execute the command
+    eval $cmd
 fi
diff --git a/llama_stack/distribution/store/registry.py b/llama_stack/distribution/store/registry.py
index ae97f600c..a6b400136 100644
--- a/llama_stack/distribution/store/registry.py
+++ b/llama_stack/distribution/store/registry.py
@@ -73,7 +73,7 @@ class DiskDistributionRegistry(DistributionRegistry):
 
     async def get_all(self) -> list[RoutableObjectWithProvider]:
         start_key, end_key = _get_registry_key_range()
-        values = await self.kvstore.range(start_key, end_key)
+        values = await self.kvstore.values_in_range(start_key, end_key)
         return _parse_registry_values(values)
 
     async def get(self, type: str, identifier: str) -> RoutableObjectWithProvider | None:
@@ -134,7 +134,7 @@ class CachedDiskDistributionRegistry(DiskDistributionRegistry):
                 return
 
             start_key, end_key = _get_registry_key_range()
-            values = await self.kvstore.range(start_key, end_key)
+            values = await self.kvstore.values_in_range(start_key, end_key)
             objects = _parse_registry_values(values)
 
             async with self._locked_cache() as cache:
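
The registry changes above track a kvstore API rename: `range` becomes `values_in_range` (and, as used elsewhere in this diff, `keys_in_range` for key listings). A toy in-memory sketch of the assumed semantics; the inclusive lexicographic bounds are an assumption, not a statement about the real backends:

    class ToyKVStore:
        def __init__(self) -> None:
            self._data: dict[str, str] = {}

        async def set(self, key: str, value: str) -> None:
            self._data[key] = value

        async def keys_in_range(self, start_key: str, end_key: str) -> list[str]:
            # keys sorted lexicographically and bounded by [start_key, end_key]
            return sorted(k for k in self._data if start_key <= k <= end_key)

        async def values_in_range(self, start_key: str, end_key: str) -> list[str]:
            return [self._data[k] for k in await self.keys_in_range(start_key, end_key)]
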
diff --git a/llama_stack/distribution/ui/page/playground/chat.py b/llama_stack/distribution/ui/page/playground/chat.py
index 8e7345169..fcaf08795 100644
--- a/llama_stack/distribution/ui/page/playground/chat.py
+++ b/llama_stack/distribution/ui/page/playground/chat.py
@@ -124,7 +124,7 @@ if prompt := st.chat_input("Example: What is Llama Stack?"):
                 message_placeholder.markdown(full_response + "▌")
             message_placeholder.markdown(full_response)
         else:
-            full_response = response
-            message_placeholder.markdown(full_response.completion_message.content)
+            full_response = response.completion_message.content
+            message_placeholder.markdown(full_response)
 
         st.session_state.messages.append({"role": "assistant", "content": full_response})
diff --git a/llama_stack/distribution/utils/config_dirs.py b/llama_stack/distribution/utils/config_dirs.py
index 9b9a7ceb3..c3e520f28 100644
--- a/llama_stack/distribution/utils/config_dirs.py
+++ b/llama_stack/distribution/utils/config_dirs.py
@@ -14,3 +14,5 @@ DISTRIBS_BASE_DIR = LLAMA_STACK_CONFIG_DIR / "distributions"
 DEFAULT_CHECKPOINT_DIR = LLAMA_STACK_CONFIG_DIR / "checkpoints"
 
 RUNTIME_BASE_DIR = LLAMA_STACK_CONFIG_DIR / "runtime"
+
+EXTERNAL_PROVIDERS_DIR = LLAMA_STACK_CONFIG_DIR / "providers.d"
diff --git a/llama_stack/distribution/utils/exec.py b/llama_stack/distribution/utils/exec.py
index 3bf3c81ce..4acce4f5b 100644
--- a/llama_stack/distribution/utils/exec.py
+++ b/llama_stack/distribution/utils/exec.py
@@ -22,8 +22,10 @@ from llama_stack.distribution.utils.image_types import LlamaStackImageType
 
 def formulate_run_args(image_type, image_name, config, template_name) -> list:
     env_name = ""
-    if image_type == LlamaStackImageType.CONTAINER.value or config.container_image:
-        env_name = f"distribution-{template_name}" if template_name else config.container_image
+    if image_type == LlamaStackImageType.CONTAINER.value:
+        env_name = (
+            f"distribution-{template_name}" if template_name else (config.container_image if config else image_name)
+        )
     elif image_type == LlamaStackImageType.CONDA.value:
         current_conda_env = os.environ.get("CONDA_DEFAULT_ENV")
         env_name = image_name or current_conda_env
diff --git a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py
index 8e6f97012..ab626e5af 100644
--- a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py
+++ b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py
@@ -245,7 +245,7 @@ class PythonListCustomToolGenerator(PromptTemplateGeneratorBase):  # noqa: N801
             {"function_description": self._gen_function_description(custom_tools)},
         )
 
-    def _gen_function_description(self, custom_tools: list[ToolDefinition]) -> PromptTemplate:
+    def _gen_function_description(self, custom_tools: list[ToolDefinition]) -> str:
         template_str = textwrap.dedent(
             """
             Here is a list of functions in JSON format that you can invoke.
@@ -286,10 +286,12 @@ class PythonListCustomToolGenerator(PromptTemplateGeneratorBase):  # noqa: N801
 
             """
         )
-        return PromptTemplate(
+        template = PromptTemplate(
             template_str.strip("\n"),
             {"tools": [t.model_dump() for t in custom_tools]},
-        ).render()
+        )
+        rendered: str = template.render()
+        return rendered
 
     def data_examples(self) -> list[list[ToolDefinition]]:
         return [
diff --git a/llama_stack/models/llama/llama4/prompt_format.md b/llama_stack/models/llama/llama4/prompt_format.md
index 350a5517a..7ae998310 100644
--- a/llama_stack/models/llama/llama4/prompt_format.md
+++ b/llama_stack/models/llama/llama4/prompt_format.md
@@ -173,9 +173,7 @@ INCORRECT: [get_events(location="Singapore")] <- If function not in list
 - Don't repeat tool response verbatim
 - Don't add supplementary information
 
-
-Here is a list of functions in JSON format that you can invoke.
-
+Here is a list of functions in JSON format that you can invoke:
 [
     {
         "name": "get_weather",
@@ -196,10 +194,7 @@ Here is a list of functions in JSON format that you can invoke.
             }
         }
     }
-]
-
-You can answer general questions or invoke tools when necessary.
-In addition to tool calls, you should also augment your responses by using the tool outputs.<|eot|><|header_start|>user<|header_end|>
+]<|eot|><|header_start|>user<|header_end|>
 
 What is the weather in SF and Seattle?<|eot|><|header_start|>assistant<|header_end|>
 
diff --git a/llama_stack/models/llama/llama4/prompt_templates/system_prompts.py b/llama_stack/models/llama/llama4/prompt_templates/system_prompts.py
index ceb28300f..9c19f89ae 100644
--- a/llama_stack/models/llama/llama4/prompt_templates/system_prompts.py
+++ b/llama_stack/models/llama/llama4/prompt_templates/system_prompts.py
@@ -61,7 +61,6 @@ class PythonListCustomToolGenerator(PromptTemplateGeneratorBase):  # noqa: N801
         - Don't repeat tool response verbatim
         - Don't add supplementary information
 
-
         {{ function_description }}
         """.strip("\n")
     )
@@ -76,8 +75,7 @@ class PythonListCustomToolGenerator(PromptTemplateGeneratorBase):  # noqa: N801
     def _gen_function_description(self, custom_tools: list[ToolDefinition]) -> PromptTemplate:
         template_str = textwrap.dedent(
             """
-            Here is a list of functions in JSON format that you can invoke.
-
+            Here is a list of functions in JSON format that you can invoke:
             [
                 {% for t in tools -%}
                 {# manually setting up JSON because jinja sorts keys in unexpected ways -#}
@@ -108,10 +106,6 @@ class PythonListCustomToolGenerator(PromptTemplateGeneratorBase):  # noqa: N801
                 {% endif -%}
                 {%- endfor %}
             ]
-
-            You can answer general questions or invoke tools when necessary.
-            In addition to tool calls, you should also augment your responses by using the tool outputs.
-
             """
         )
         return PromptTemplate(
diff --git a/llama_stack/models/llama/sku_list.py b/llama_stack/models/llama/sku_list.py
index a82cbf708..271cec63f 100644
--- a/llama_stack/models/llama/sku_list.py
+++ b/llama_stack/models/llama/sku_list.py
@@ -948,6 +948,8 @@ def llama_meta_net_info(model: Model) -> LlamaDownloadInfo:
     elif model.core_model_id == CoreModelId.llama_guard_2_8b:
         folder = "llama-guard-2"
     else:
+        if model.huggingface_repo is None:
+            raise ValueError(f"Model {model.core_model_id} has no huggingface_repo set")
         folder = model.huggingface_repo.split("/")[-1]
         if "Llama-2" in folder:
             folder = folder.lower()
@@ -1024,3 +1026,4 @@ def llama_meta_pth_size(model: Model) -> int:
                 return 54121549657
             else:
                 return 100426653046
+    return 0
diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
index 3807ddb86..2e387e7e8 100644
--- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
@@ -95,6 +95,7 @@ class ChatAgent(ShieldRunnerMixin):
         tool_groups_api: ToolGroups,
         vector_io_api: VectorIO,
         persistence_store: KVStore,
+        created_at: str,
     ):
         self.agent_id = agent_id
         self.agent_config = agent_config
@@ -104,6 +105,7 @@ class ChatAgent(ShieldRunnerMixin):
         self.storage = AgentPersistence(agent_id, persistence_store)
         self.tool_runtime_api = tool_runtime_api
         self.tool_groups_api = tool_groups_api
+        self.created_at = created_at
 
         ShieldRunnerMixin.__init__(
             self,
diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py
index a87739925..86780fd61 100644
--- a/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agents.py
@@ -4,10 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-import json
 import logging
 import uuid
 from collections.abc import AsyncGenerator
+from datetime import datetime, timezone
 
 from llama_stack.apis.agents import (
     Agent,
@@ -20,14 +20,13 @@ from llama_stack.apis.agents import (
     AgentTurnCreateRequest,
     AgentTurnResumeRequest,
     Document,
-    ListAgentSessionsResponse,
-    ListAgentsResponse,
-    OpenAIResponseInputMessage,
+    OpenAIResponseInput,
     OpenAIResponseInputTool,
     OpenAIResponseObject,
     Session,
     Turn,
 )
+from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.apis.inference import (
     Inference,
     ToolConfig,
@@ -39,13 +38,14 @@ from llama_stack.apis.safety import Safety
 from llama_stack.apis.tools import ToolGroups, ToolRuntime
 from llama_stack.apis.vector_io import VectorIO
 from llama_stack.providers.utils.kvstore import InmemoryKVStoreImpl, kvstore_impl
+from llama_stack.providers.utils.pagination import paginate_records
 
 from .agent_instance import ChatAgent
 from .config import MetaReferenceAgentsImplConfig
 from .openai_responses import OpenAIResponsesImpl
+from .persistence import AgentInfo
 
 logger = logging.getLogger()
-logger.setLevel(logging.INFO)
 
 
 class MetaReferenceAgentsImpl(Agents):
@@ -82,43 +82,47 @@ class MetaReferenceAgentsImpl(Agents):
         agent_config: AgentConfig,
     ) -> AgentCreateResponse:
         agent_id = str(uuid.uuid4())
+        created_at = datetime.now(timezone.utc)
 
+        agent_info = AgentInfo(
+            **agent_config.model_dump(),
+            created_at=created_at,
+        )
+
+        # Store the agent info
         await self.persistence_store.set(
             key=f"agent:{agent_id}",
-            value=agent_config.model_dump_json(),
+            value=agent_info.model_dump_json(),
         )
+
         return AgentCreateResponse(
             agent_id=agent_id,
         )
 
     async def _get_agent_impl(self, agent_id: str) -> ChatAgent:
-        agent_config = await self.persistence_store.get(
+        agent_info_json = await self.persistence_store.get(
             key=f"agent:{agent_id}",
         )
-        if not agent_config:
-            raise ValueError(f"Could not find agent config for {agent_id}")
+        if not agent_info_json:
+            raise ValueError(f"Could not find agent info for {agent_id}")
 
         try:
-            agent_config = json.loads(agent_config)
-        except json.JSONDecodeError as e:
-            raise ValueError(f"Could not JSON decode agent config for {agent_id}") from e
-
-        try:
-            agent_config = AgentConfig(**agent_config)
+            agent_info = AgentInfo.model_validate_json(agent_info_json)
         except Exception as e:
-            raise ValueError(f"Could not validate(?) agent config for {agent_id}") from e
+            raise ValueError(f"Could not validate agent info for {agent_id}") from e
 
         return ChatAgent(
             agent_id=agent_id,
-            agent_config=agent_config,
+            agent_config=agent_info,
             inference_api=self.inference_api,
             safety_api=self.safety_api,
             vector_io_api=self.vector_io_api,
             tool_runtime_api=self.tool_runtime_api,
             tool_groups_api=self.tool_groups_api,
             persistence_store=(
-                self.persistence_store if agent_config.enable_session_persistence else self.in_memory_store
+                self.persistence_store if agent_info.enable_session_persistence else self.in_memory_store
             ),
+            created_at=agent_info.created_at,
         )
 
     async def create_agent_session(
@@ -212,6 +216,7 @@ class MetaReferenceAgentsImpl(Agents):
         turn_ids: list[str] | None = None,
     ) -> Session:
         agent = await self._get_agent_impl(agent_id)
+
         session_info = await agent.storage.get_session_info(session_id)
         if session_info is None:
             raise ValueError(f"Session {session_id} not found")
@@ -226,24 +231,75 @@ class MetaReferenceAgentsImpl(Agents):
         )
 
     async def delete_agents_session(self, agent_id: str, session_id: str) -> None:
-        await self.persistence_store.delete(f"session:{agent_id}:{session_id}")
+        agent = await self._get_agent_impl(agent_id)
+        session_info = await agent.storage.get_session_info(session_id)
+        if session_info is None:
+            raise ValueError(f"Session {session_id} not found")
+
+        # Delete turns first, then the session
+        await agent.storage.delete_session_turns(session_id)
+        await agent.storage.delete_session(session_id)
 
     async def delete_agent(self, agent_id: str) -> None:
+        # First get all sessions for this agent
+        agent = await self._get_agent_impl(agent_id)
+        sessions = await agent.storage.list_sessions()
+
+        # Delete all sessions
+        for session in sessions:
+            await self.delete_agents_session(agent_id, session.session_id)
+
+        # Finally delete the agent itself
         await self.persistence_store.delete(f"agent:{agent_id}")
 
-    async def shutdown(self) -> None:
-        pass
+    async def list_agents(self, start_index: int | None = None, limit: int | None = None) -> PaginatedResponse:
+        agent_keys = await self.persistence_store.keys_in_range("agent:", "agent:\xff")
+        agent_list: list[Agent] = []
+        for agent_key in agent_keys:
+            agent_id = agent_key.split(":")[1]
 
-    async def list_agents(self) -> ListAgentsResponse:
-        pass
+            # Get the agent info using the key
+            agent_info_json = await self.persistence_store.get(agent_key)
+            if not agent_info_json:
+                logger.error(f"Could not find agent info for key {agent_key}")
+                continue
+
+            try:
+                agent_info = AgentInfo.model_validate_json(agent_info_json)
+                agent_list.append(
+                    Agent(
+                        agent_id=agent_id,
+                        agent_config=agent_info,
+                        created_at=agent_info.created_at,
+                    )
+                )
+            except Exception as e:
+                logger.error(f"Error parsing agent info for {agent_id}: {e}")
+                continue
+
+        # Convert Agent objects to dictionaries
+        agent_dicts = [agent.model_dump() for agent in agent_list]
+        return paginate_records(agent_dicts, start_index, limit)
 
     async def get_agent(self, agent_id: str) -> Agent:
-        pass
+        chat_agent = await self._get_agent_impl(agent_id)
+        agent = Agent(
+            agent_id=agent_id,
+            agent_config=chat_agent.agent_config,
+            created_at=chat_agent.created_at,
+        )
+        return agent
 
     async def list_agent_sessions(
-        self,
-        agent_id: str,
-    ) -> ListAgentSessionsResponse:
+        self, agent_id: str, start_index: int | None = None, limit: int | None = None
+    ) -> PaginatedResponse:
+        agent = await self._get_agent_impl(agent_id)
+        sessions = await agent.storage.list_sessions()
+        # Convert Session objects to dictionaries
+        session_dicts = [session.model_dump() for session in sessions]
+        return paginate_records(session_dicts, start_index, limit)
+
+    async def shutdown(self) -> None:
         pass
 
     # OpenAI responses
@@ -255,7 +311,7 @@ class MetaReferenceAgentsImpl(Agents):
 
     async def create_openai_response(
         self,
-        input: str | list[OpenAIResponseInputMessage],
+        input: str | list[OpenAIResponseInput],
         model: str,
         previous_response_id: str | None = None,
         store: bool | None = True,
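
`list_agents` and `list_agent_sessions` above now return a `PaginatedResponse` built by `paginate_records` over plain dicts. A standalone sketch of the pagination contract this relies on (slice by `start_index`/`limit`, report whether more records remain); it mimics, but is not, the actual helper:

    def paginate(records: list[dict], start_index: int | None, limit: int | None) -> dict:
        start = start_index or 0
        end = len(records) if limit is None else start + limit
        page = records[start:end]
        return {"data": page, "has_more": end < len(records)}

    agents = [{"agent_id": f"agent-{i}"} for i in range(5)]
    print(paginate(agents, start_index=0, limit=2))  # 2 agents, has_more=True
    print(paginate(agents, start_index=4, limit=2))  # 1 agent, has_more=False
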
diff --git a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
index 24a99dd6e..6d9d06109 100644
--- a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
+++ b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
@@ -7,22 +7,29 @@
 import json
 import uuid
 from collections.abc import AsyncIterator
-from typing import cast
+from typing import Any, cast
 
 from openai.types.chat import ChatCompletionToolParam
+from pydantic import BaseModel
 
 from llama_stack.apis.agents.openai_responses import (
-    OpenAIResponseInputMessage,
+    OpenAIResponseInput,
+    OpenAIResponseInputFunctionToolCallOutput,
+    OpenAIResponseInputItemList,
+    OpenAIResponseInputMessageContent,
     OpenAIResponseInputMessageContentImage,
     OpenAIResponseInputMessageContentText,
     OpenAIResponseInputTool,
+    OpenAIResponseInputToolFunction,
+    OpenAIResponseMessage,
     OpenAIResponseObject,
     OpenAIResponseObjectStream,
     OpenAIResponseObjectStreamResponseCompleted,
     OpenAIResponseObjectStreamResponseCreated,
     OpenAIResponseOutput,
-    OpenAIResponseOutputMessage,
+    OpenAIResponseOutputMessageContent,
     OpenAIResponseOutputMessageContentOutputText,
+    OpenAIResponseOutputMessageFunctionToolCall,
     OpenAIResponseOutputMessageWebSearchToolCall,
 )
 from llama_stack.apis.inference.inference import (
@@ -32,10 +39,13 @@ from llama_stack.apis.inference.inference import (
     OpenAIChatCompletionContentPartImageParam,
     OpenAIChatCompletionContentPartParam,
     OpenAIChatCompletionContentPartTextParam,
+    OpenAIChatCompletionToolCall,
     OpenAIChatCompletionToolCallFunction,
     OpenAIChoice,
+    OpenAIDeveloperMessageParam,
     OpenAIImageURL,
     OpenAIMessageParam,
+    OpenAISystemMessageParam,
     OpenAIToolMessageParam,
     OpenAIUserMessageParam,
 )
@@ -50,31 +60,110 @@ logger = get_logger(name=__name__, category="openai_responses")
 OPENAI_RESPONSES_PREFIX = "openai_responses:"
 
 
-async def _previous_response_to_messages(previous_response: OpenAIResponseObject) -> list[OpenAIMessageParam]:
+async def _convert_response_content_to_chat_content(
+    content: str | list[OpenAIResponseInputMessageContent] | list[OpenAIResponseOutputMessageContent],
+) -> str | list[OpenAIChatCompletionContentPartParam]:
+    """
+    Convert the content parts from an OpenAI Response API request into OpenAI Chat Completion content parts.
+
+    The content schemas of each API look similar, but are not exactly the same.
+    """
+    if isinstance(content, str):
+        return content
+
+    converted_parts = []
+    for content_part in content:
+        if isinstance(content_part, OpenAIResponseInputMessageContentText):
+            converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part.text))
+        elif isinstance(content_part, OpenAIResponseOutputMessageContentOutputText):
+            converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part.text))
+        elif isinstance(content_part, OpenAIResponseInputMessageContentImage):
+            if content_part.image_url:
+                image_url = OpenAIImageURL(url=content_part.image_url, detail=content_part.detail)
+                converted_parts.append(OpenAIChatCompletionContentPartImageParam(image_url=image_url))
+        elif isinstance(content_part, str):
+            converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part))
+        else:
+            raise ValueError(
+                f"Llama Stack OpenAI Responses does not yet support content type '{type(content_part)}' in this context"
+            )
+    return converted_parts
+
+
+async def _convert_response_input_to_chat_messages(
+    input: str | list[OpenAIResponseInput],
+) -> list[OpenAIMessageParam]:
+    """
+    Convert the input from an OpenAI Response API request into OpenAI Chat Completion messages.
+    """
     messages: list[OpenAIMessageParam] = []
-    for output_message in previous_response.output:
-        if isinstance(output_message, OpenAIResponseOutputMessage):
-            messages.append(OpenAIAssistantMessageParam(content=output_message.content[0].text))
+    if isinstance(input, list):
+        for input_item in input:
+            if isinstance(input_item, OpenAIResponseInputFunctionToolCallOutput):
+                messages.append(
+                    OpenAIToolMessageParam(
+                        content=input_item.output,
+                        tool_call_id=input_item.call_id,
+                    )
+                )
+            elif isinstance(input_item, OpenAIResponseOutputMessageFunctionToolCall):
+                tool_call = OpenAIChatCompletionToolCall(
+                    index=0,
+                    id=input_item.call_id,
+                    function=OpenAIChatCompletionToolCallFunction(
+                        name=input_item.name,
+                        arguments=input_item.arguments,
+                    ),
+                )
+                messages.append(OpenAIAssistantMessageParam(tool_calls=[tool_call]))
+            else:
+                content = await _convert_response_content_to_chat_content(input_item.content)
+                message_type = await _get_message_type_by_role(input_item.role)
+                if message_type is None:
+                    raise ValueError(
+                        f"Llama Stack OpenAI Responses does not yet support message role '{input_item.role}' in this context"
+                    )
+                messages.append(message_type(content=content))
+    else:
+        messages.append(OpenAIUserMessageParam(content=input))
     return messages
 
 
-async def _openai_choices_to_output_messages(choices: list[OpenAIChoice]) -> list[OpenAIResponseOutputMessage]:
-    output_messages = []
-    for choice in choices:
-        output_content = ""
-        if isinstance(choice.message.content, str):
-            output_content = choice.message.content
-        elif isinstance(choice.message.content, OpenAIChatCompletionContentPartTextParam):
-            output_content = choice.message.content.text
-        # TODO: handle image content
-        output_messages.append(
-            OpenAIResponseOutputMessage(
-                id=f"msg_{uuid.uuid4()}",
-                content=[OpenAIResponseOutputMessageContentOutputText(text=output_content)],
-                status="completed",
-            )
+async def _convert_chat_choice_to_response_message(choice: OpenAIChoice) -> OpenAIResponseMessage:
+    """
+    Convert an OpenAI Chat Completion choice into an OpenAI Response output message.
+    """
+    output_content = ""
+    if isinstance(choice.message.content, str):
+        output_content = choice.message.content
+    elif isinstance(choice.message.content, OpenAIChatCompletionContentPartTextParam):
+        output_content = choice.message.content.text
+    else:
+        raise ValueError(
+            f"Llama Stack OpenAI Responses does not yet support output content type: {type(choice.message.content)}"
         )
-    return output_messages
+
+    return OpenAIResponseMessage(
+        id=f"msg_{uuid.uuid4()}",
+        content=[OpenAIResponseOutputMessageContentOutputText(text=output_content)],
+        status="completed",
+        role="assistant",
+    )
+
+
+async def _get_message_type_by_role(role: str):
+    role_to_type = {
+        "user": OpenAIUserMessageParam,
+        "system": OpenAISystemMessageParam,
+        "assistant": OpenAIAssistantMessageParam,
+        "developer": OpenAIDeveloperMessageParam,
+    }
+    return role_to_type.get(role)
+
+
+class OpenAIResponsePreviousResponseWithInputItems(BaseModel):
+    input_items: OpenAIResponseInputItemList
+    response: OpenAIResponseObject
 
 
 class OpenAIResponsesImpl:
@@ -90,19 +179,45 @@ class OpenAIResponsesImpl:
         self.tool_groups_api = tool_groups_api
         self.tool_runtime_api = tool_runtime_api
 
-    async def get_openai_response(
-        self,
-        id: str,
-    ) -> OpenAIResponseObject:
+    async def _get_previous_response_with_input(self, id: str) -> OpenAIResponsePreviousResponseWithInputItems:
         key = f"{OPENAI_RESPONSES_PREFIX}{id}"
         response_json = await self.persistence_store.get(key=key)
         if response_json is None:
             raise ValueError(f"OpenAI response with id '{id}' not found")
-        return OpenAIResponseObject.model_validate_json(response_json)
+        return OpenAIResponsePreviousResponseWithInputItems.model_validate_json(response_json)
+
+    async def _prepend_previous_response(
+        self, input: str | list[OpenAIResponseInput], previous_response_id: str | None = None
+    ):
+        if previous_response_id:
+            previous_response_with_input = await self._get_previous_response_with_input(previous_response_id)
+
+            # previous response input items
+            new_input_items = previous_response_with_input.input_items.data
+
+            # previous response output items
+            new_input_items.extend(previous_response_with_input.response.output)
+
+            # new input items from the current request
+            if isinstance(input, str):
+                new_input_items.append(OpenAIResponseMessage(content=input, role="user"))
+            else:
+                new_input_items.extend(input)
+
+            input = new_input_items
+
+        return input
+
+    async def get_openai_response(
+        self,
+        id: str,
+    ) -> OpenAIResponseObject:
+        response_with_input = await self._get_previous_response_with_input(id)
+        return response_with_input.response
 
     async def create_openai_response(
         self,
-        input: str | list[OpenAIResponseInputMessage],
+        input: str | list[OpenAIResponseInput],
         model: str,
         previous_response_id: str | None = None,
         store: bool | None = True,
@@ -112,31 +227,8 @@ class OpenAIResponsesImpl:
     ):
         stream = False if stream is None else stream
 
-        messages: list[OpenAIMessageParam] = []
-        if previous_response_id:
-            previous_response = await self.get_openai_response(previous_response_id)
-            messages.extend(await _previous_response_to_messages(previous_response))
-        # TODO: refactor this user_content parsing out into a separate method
-        user_content: str | list[OpenAIChatCompletionContentPartParam] = ""
-        if isinstance(input, list):
-            user_content = []
-            for user_input in input:
-                if isinstance(user_input.content, list):
-                    for user_input_content in user_input.content:
-                        if isinstance(user_input_content, OpenAIResponseInputMessageContentText):
-                            user_content.append(OpenAIChatCompletionContentPartTextParam(text=user_input_content.text))
-                        elif isinstance(user_input_content, OpenAIResponseInputMessageContentImage):
-                            if user_input_content.image_url:
-                                image_url = OpenAIImageURL(
-                                    url=user_input_content.image_url, detail=user_input_content.detail
-                                )
-                                user_content.append(OpenAIChatCompletionContentPartImageParam(image_url=image_url))
-                else:
-                    user_content.append(OpenAIChatCompletionContentPartTextParam(text=user_input.content))
-        else:
-            user_content = input
-        messages.append(OpenAIUserMessageParam(content=user_content))
-
+        input = await self._prepend_previous_response(input, previous_response_id)
+        messages = await _convert_response_input_to_chat_messages(input)
         chat_tools = await self._convert_response_tools_to_chat_tools(tools) if tools else None
         chat_response = await self.inference_api.openai_chat_completion(
             model=model,
@@ -150,6 +242,7 @@ class OpenAIResponsesImpl:
             # TODO: refactor this into a separate method that handles streaming
             chat_response_id = ""
             chat_response_content = []
+            chat_response_tool_calls: dict[int, OpenAIChatCompletionToolCall] = {}
             # TODO: these chunk_ fields are hacky and only take the last chunk into account
             chunk_created = 0
             chunk_model = ""
@@ -163,7 +256,30 @@ class OpenAIResponsesImpl:
                     chat_response_content.append(chunk_choice.delta.content or "")
                     if chunk_choice.finish_reason:
                         chunk_finish_reason = chunk_choice.finish_reason
-            assistant_message = OpenAIAssistantMessageParam(content="".join(chat_response_content))
+
+                    # Aggregate tool call arguments across chunks, using their index as the aggregation key
+                    if chunk_choice.delta.tool_calls:
+                        for tool_call in chunk_choice.delta.tool_calls:
+                            response_tool_call = chat_response_tool_calls.get(tool_call.index, None)
+                            if response_tool_call:
+                                response_tool_call.function.arguments += tool_call.function.arguments
+                            else:
+                                tool_call_dict: dict[str, Any] = tool_call.model_dump()
+                                # Ensure we don't have any empty type field in the tool call dict.
+                                # The OpenAI client used by providers often returns a type=None here.
+                                tool_call_dict.pop("type", None)
+                                response_tool_call = OpenAIChatCompletionToolCall(**tool_call_dict)
+                            chat_response_tool_calls[tool_call.index] = response_tool_call
+
+            # Convert the dict of tool calls by index to a list of tool calls to pass back in our response
+            if chat_response_tool_calls:
+                tool_calls = [chat_response_tool_calls[i] for i in sorted(chat_response_tool_calls.keys())]
+            else:
+                tool_calls = None
+            assistant_message = OpenAIAssistantMessageParam(
+                content="".join(chat_response_content),
+                tool_calls=tool_calls,
+            )
             chat_response = OpenAIChatCompletion(
                 id=chat_response_id,
                 choices=[
@@ -181,12 +297,26 @@ class OpenAIResponsesImpl:
             chat_response = OpenAIChatCompletion(**chat_response.model_dump())
 
         output_messages: list[OpenAIResponseOutput] = []
-        if chat_response.choices[0].message.tool_calls:
-            output_messages.extend(
-                await self._execute_tool_and_return_final_output(model, stream, chat_response, messages, temperature)
-            )
-        else:
-            output_messages.extend(await _openai_choices_to_output_messages(chat_response.choices))
+        for choice in chat_response.choices:
+            if choice.message.tool_calls and tools:
+                # Assume if the first tool is a function, all tools are functions
+                if isinstance(tools[0], OpenAIResponseInputToolFunction):
+                    for tool_call in choice.message.tool_calls:
+                        output_messages.append(
+                            OpenAIResponseOutputMessageFunctionToolCall(
+                                arguments=tool_call.function.arguments or "",
+                                call_id=tool_call.id,
+                                name=tool_call.function.name or "",
+                                id=f"fc_{uuid.uuid4()}",
+                                status="completed",
+                            )
+                        )
+                else:
+                    output_messages.extend(
+                        await self._execute_tool_and_return_final_output(model, stream, choice, messages, temperature)
+                    )
+            else:
+                output_messages.append(await _convert_chat_choice_to_response_message(choice))
         response = OpenAIResponseObject(
             created_at=chat_response.created,
             id=f"resp-{uuid.uuid4()}",
@@ -195,13 +325,43 @@ class OpenAIResponsesImpl:
             status="completed",
             output=output_messages,
         )
+        logger.debug(f"OpenAI Responses response: {response}")
 
         if store:
             # Store in kvstore
+
+            new_input_id = f"msg_{uuid.uuid4()}"
+            if isinstance(input, str):
+                # synthesize a message from the input string
+                input_content = OpenAIResponseInputMessageContentText(text=input)
+                input_content_item = OpenAIResponseMessage(
+                    role="user",
+                    content=[input_content],
+                    id=new_input_id,
+                )
+                input_items_data = [input_content_item]
+            else:
+                # we already have a list of messages
+                input_items_data = []
+                for input_item in input:
+                    if isinstance(input_item, OpenAIResponseMessage):
+                        # These may or may not already have an id, so dump to dict, check for id, and add if missing
+                        input_item_dict = input_item.model_dump()
+                        if "id" not in input_item_dict:
+                            input_item_dict["id"] = new_input_id
+                        input_items_data.append(OpenAIResponseMessage(**input_item_dict))
+                    else:
+                        input_items_data.append(input_item)
+
+            input_items = OpenAIResponseInputItemList(data=input_items_data)
+            prev_response = OpenAIResponsePreviousResponseWithInputItems(
+                input_items=input_items,
+                response=response,
+            )
             key = f"{OPENAI_RESPONSES_PREFIX}{response.id}"
             await self.persistence_store.set(
                 key=key,
-                value=response.model_dump_json(),
+                value=prev_response.model_dump_json(),
             )
 
         if stream:
@@ -221,7 +381,9 @@ class OpenAIResponsesImpl:
         chat_tools: list[ChatCompletionToolParam] = []
         for input_tool in tools:
             # TODO: Handle other tool types
-            if input_tool.type == "web_search":
+            if input_tool.type == "function":
+                chat_tools.append(ChatCompletionToolParam(type="function", function=input_tool.model_dump()))
+            elif input_tool.type == "web_search":
                 tool_name = "web_search"
                 tool = await self.tool_groups_api.get_tool(tool_name)
                 tool_def = ToolDefinition(
@@ -247,12 +409,11 @@ class OpenAIResponsesImpl:
         self,
         model_id: str,
         stream: bool,
-        chat_response: OpenAIChatCompletion,
+        choice: OpenAIChoice,
         messages: list[OpenAIMessageParam],
         temperature: float,
     ) -> list[OpenAIResponseOutput]:
         output_messages: list[OpenAIResponseOutput] = []
-        choice = chat_response.choices[0]
 
         # If the choice is not an assistant message, we don't need to execute any tools
         if not isinstance(choice.message, OpenAIAssistantMessageParam):
@@ -262,6 +423,9 @@ class OpenAIResponsesImpl:
         if not choice.message.tool_calls:
             return output_messages
 
+        # Copy the messages list to avoid mutating the original list
+        messages = messages.copy()
+
         # Add the assistant message with tool_calls response to the messages list
         messages.append(choice.message)
 
@@ -307,7 +471,9 @@ class OpenAIResponsesImpl:
         )
         # type cast to appease mypy
         tool_results_chat_response = cast(OpenAIChatCompletion, tool_results_chat_response)
-        tool_final_outputs = await _openai_choices_to_output_messages(tool_results_chat_response.choices)
+        tool_final_outputs = [
+            await _convert_chat_choice_to_response_message(choice) for choice in tool_results_chat_response.choices
+        ]
         # TODO: Wire in annotations with URLs, titles, etc to these output messages
         output_messages.extend(tool_final_outputs)
         return output_messages
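
The streaming branch above rebuilds each function tool call by concatenating the `arguments` fragments that arrive across chunks, keyed on the delta's `index`. A self-contained sketch of that aggregation, using plain dicts in place of the real chunk/delta classes:

    # Each streamed delta carries an index, optionally an id/name on its first
    # appearance, and a fragment of the JSON arguments string.
    chunks = [
        {"index": 0, "id": "call_1", "name": "get_weather", "arguments": '{"loc'},
        {"index": 0, "arguments": 'ation": "SF"}'},
    ]

    calls: dict[int, dict] = {}
    for delta in chunks:
        existing = calls.get(delta["index"])
        if existing:
            existing["arguments"] += delta["arguments"]
        else:
            calls[delta["index"]] = dict(delta)

    print([calls[i] for i in sorted(calls)])
    # -> [{'index': 0, 'id': 'call_1', 'name': 'get_weather', 'arguments': '{"location": "SF"}'}]
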
diff --git a/llama_stack/providers/inline/agents/meta_reference/persistence.py b/llama_stack/providers/inline/agents/meta_reference/persistence.py
index 60ce91395..5031a4a90 100644
--- a/llama_stack/providers/inline/agents/meta_reference/persistence.py
+++ b/llama_stack/providers/inline/agents/meta_reference/persistence.py
@@ -9,9 +9,7 @@ import logging
 import uuid
 from datetime import datetime, timezone
 
-from pydantic import BaseModel
-
-from llama_stack.apis.agents import ToolExecutionStep, Turn
+from llama_stack.apis.agents import AgentConfig, Session, ToolExecutionStep, Turn
 from llama_stack.distribution.access_control import check_access
 from llama_stack.distribution.datatypes import AccessAttributes
 from llama_stack.distribution.request_headers import get_auth_attributes
@@ -20,15 +18,17 @@ from llama_stack.providers.utils.kvstore import KVStore
 log = logging.getLogger(__name__)
 
 
-class AgentSessionInfo(BaseModel):
-    session_id: str
-    session_name: str
+class AgentSessionInfo(Session):
     # TODO: is this used anywhere?
     vector_db_id: str | None = None
     started_at: datetime
     access_attributes: AccessAttributes | None = None
 
 
+class AgentInfo(AgentConfig):
+    created_at: datetime
+
+
 class AgentPersistence:
     def __init__(self, agent_id: str, kvstore: KVStore):
         self.agent_id = agent_id
@@ -46,6 +46,7 @@ class AgentPersistence:
             session_name=name,
             started_at=datetime.now(timezone.utc),
             access_attributes=access_attributes,
+            turns=[],
         )
 
         await self.kvstore.set(
@@ -109,7 +110,7 @@ class AgentPersistence:
         if not await self.get_session_if_accessible(session_id):
             raise ValueError(f"Session {session_id} not found or access denied")
 
-        values = await self.kvstore.range(
+        values = await self.kvstore.values_in_range(
             start_key=f"session:{self.agent_id}:{session_id}:",
             end_key=f"session:{self.agent_id}:{session_id}:\xff\xff\xff\xff",
         )
@@ -121,7 +122,6 @@ class AgentPersistence:
             except Exception as e:
                 log.error(f"Error parsing turn: {e}")
                 continue
-        turns.sort(key=lambda x: (x.completed_at or datetime.min))
         return turns
 
     async def get_session_turn(self, session_id: str, turn_id: str) -> Turn | None:
@@ -170,3 +170,43 @@ class AgentPersistence:
             key=f"num_infer_iters_in_turn:{self.agent_id}:{session_id}:{turn_id}",
         )
         return int(value) if value else None
+
+    async def list_sessions(self) -> list[Session]:
+        values = await self.kvstore.values_in_range(
+            start_key=f"session:{self.agent_id}:",
+            end_key=f"session:{self.agent_id}:\xff\xff\xff\xff",
+        )
+        sessions = []
+        for value in values:
+            try:
+                session_info = Session(**json.loads(value))
+                sessions.append(session_info)
+            except Exception as e:
+                log.error(f"Error parsing session info: {e}")
+                continue
+        return sessions
+
+    async def delete_session_turns(self, session_id: str) -> None:
+        """Delete all turns and their associated data for a session.
+
+        Args:
+            session_id: The ID of the session whose turns should be deleted.
+        """
+        turns = await self.get_session_turns(session_id)
+        for turn in turns:
+            await self.kvstore.delete(key=f"session:{self.agent_id}:{session_id}:{turn.turn_id}")
+
+    async def delete_session(self, session_id: str) -> None:
+        """Delete a session and all its associated turns.
+
+        Args:
+            session_id: The ID of the session to delete.
+
+        Raises:
+            ValueError: If the session does not exist.
+        """
+        session_info = await self.get_session_info(session_id)
+        if session_info is None:
+            raise ValueError(f"Session {session_id} not found")
+
+        await self.kvstore.delete(key=f"session:{self.agent_id}:{session_id}")
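
The new delete helpers above remove turn records before the session record itself. The kvstore key scheme they operate on, as used throughout this file (the IDs below are hypothetical):

    agent_id, session_id, turn_id = "a1", "s1", "t1"
    keys = [
        f"agent:{agent_id}",                                           # AgentInfo JSON
        f"session:{agent_id}:{session_id}",                            # AgentSessionInfo JSON
        f"session:{agent_id}:{session_id}:{turn_id}",                  # Turn JSON
        f"num_infer_iters_in_turn:{agent_id}:{session_id}:{turn_id}",  # per-turn counter
    ]
    print(keys)
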
diff --git a/llama_stack/providers/inline/datasetio/localfs/datasetio.py b/llama_stack/providers/inline/datasetio/localfs/datasetio.py
index 260a640bd..da71ecb17 100644
--- a/llama_stack/providers/inline/datasetio/localfs/datasetio.py
+++ b/llama_stack/providers/inline/datasetio/localfs/datasetio.py
@@ -11,9 +11,9 @@ from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Dataset
 from llama_stack.providers.datatypes import DatasetsProtocolPrivate
-from llama_stack.providers.utils.datasetio.pagination import paginate_records
 from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_uri
 from llama_stack.providers.utils.kvstore import kvstore_impl
+from llama_stack.providers.utils.pagination import paginate_records
 
 from .config import LocalFSDatasetIOConfig
 
@@ -64,7 +64,7 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
         # Load existing datasets from kvstore
         start_key = DATASETS_PREFIX
         end_key = f"{DATASETS_PREFIX}\xff"
-        stored_datasets = await self.kvstore.range(start_key, end_key)
+        stored_datasets = await self.kvstore.values_in_range(start_key, end_key)
 
         for dataset in stored_datasets:
             dataset = Dataset.model_validate_json(dataset)
diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py
index 12dde66b5..bc0898dc5 100644
--- a/llama_stack/providers/inline/eval/meta_reference/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@@ -58,7 +58,7 @@ class MetaReferenceEvalImpl(
         # Load existing benchmarks from kvstore
         start_key = EVAL_TASKS_PREFIX
         end_key = f"{EVAL_TASKS_PREFIX}\xff"
-        stored_benchmarks = await self.kvstore.range(start_key, end_key)
+        stored_benchmarks = await self.kvstore.values_in_range(start_key, end_key)
 
         for benchmark in stored_benchmarks:
             benchmark = Benchmark.model_validate_json(benchmark)
diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py
index 8dd594869..048336f9e 100644
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@@ -28,7 +28,7 @@ from llama_stack.apis.inference import (
     CompletionRequest,
     CompletionResponse,
     CompletionResponseStreamChunk,
-    Inference,
+    InferenceProvider,
     InterleavedContent,
     LogProbConfig,
     Message,
@@ -86,7 +86,7 @@ class MetaReferenceInferenceImpl(
     OpenAICompletionToLlamaStackMixin,
     OpenAIChatCompletionToLlamaStackMixin,
     SentenceTransformerEmbeddingMixin,
-    Inference,
+    InferenceProvider,
     ModelsProtocolPrivate,
 ):
     def __init__(self, config: MetaReferenceInferenceConfig) -> None:
diff --git a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
index 7b36b0997..890c526f5 100644
--- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
@@ -9,7 +9,7 @@ from collections.abc import AsyncGenerator
 
 from llama_stack.apis.inference import (
     CompletionResponse,
-    Inference,
+    InferenceProvider,
     InterleavedContent,
     LogProbConfig,
     Message,
@@ -38,7 +38,7 @@ class SentenceTransformersInferenceImpl(
     OpenAIChatCompletionToLlamaStackMixin,
     OpenAICompletionToLlamaStackMixin,
     SentenceTransformerEmbeddingMixin,
-    Inference,
+    InferenceProvider,
     ModelsProtocolPrivate,
 ):
     def __init__(self, config: SentenceTransformersInferenceConfig) -> None:
diff --git a/llama_stack/providers/inline/post_training/common/utils.py b/llama_stack/providers/inline/post_training/common/utils.py
new file mode 100644
index 000000000..7840b21e8
--- /dev/null
+++ b/llama_stack/providers/inline/post_training/common/utils.py
@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import gc
+
+
+def evacuate_model_from_device(model, device: str):
+    """Safely clear a model from memory and free device resources.
+    This function handles the proper cleanup of a model by:
+    1. Moving the model to CPU if it's on a non-CPU device
+    2. Deleting the model object to free memory
+    3. Running garbage collection
+    4. Clearing CUDA cache if the model was on a CUDA device
+    Args:
+        model: The PyTorch model to clear
+        device: The device type the model is currently on ('cuda', 'mps', 'cpu')
+    Note:
+        - For CUDA devices, this will clear the CUDA cache after moving the model to CPU
+        - For MPS devices, only moves the model to CPU (no cache clearing available)
+        - For CPU devices, only deletes the model object and runs garbage collection
+    """
+    if device != "cpu":
+        model.to("cpu")
+
+    del model
+    gc.collect()
+
+    if device == "cuda":
+        # import torch lazily so the dependency is only loaded when this runs on a CUDA device
+        import torch
+
+        torch.cuda.empty_cache()
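
A minimal usage sketch for the new helper, assuming a small PyTorch module (the model here is only an illustration):

    import torch

    from llama_stack.providers.inline.post_training.common.utils import (
        evacuate_model_from_device,
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = torch.nn.Linear(4, 4).to(device)

    # ... training loop would run here ...

    # Release the model and free device memory once it is no longer needed.
    evacuate_model_from_device(model, device)
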
diff --git a/llama_stack/providers/inline/post_training/huggingface/__init__.py b/llama_stack/providers/inline/post_training/huggingface/__init__.py
new file mode 100644
index 000000000..cc1a671c1
--- /dev/null
+++ b/llama_stack/providers/inline/post_training/huggingface/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.distribution.datatypes import Api
+
+from .config import HuggingFacePostTrainingConfig
+
+# post_training api and the huggingface provider is still experimental and under heavy development
+
+
+async def get_provider_impl(
+    config: HuggingFacePostTrainingConfig,
+    deps: dict[Api, Any],
+):
+    from .post_training import HuggingFacePostTrainingImpl
+
+    impl = HuggingFacePostTrainingImpl(
+        config,
+        deps[Api.datasetio],
+        deps[Api.datasets],
+    )
+    return impl
diff --git a/llama_stack/providers/inline/post_training/huggingface/config.py b/llama_stack/providers/inline/post_training/huggingface/config.py
new file mode 100644
index 000000000..06c6d8073
--- /dev/null
+++ b/llama_stack/providers/inline/post_training/huggingface/config.py
@@ -0,0 +1,72 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Literal
+
+from pydantic import BaseModel
+
+
+class HuggingFacePostTrainingConfig(BaseModel):
+    # Device to run training on (cuda, cpu, mps)
+    device: str = "cuda"
+
+    # Distributed training backend if using multiple devices
+    # fsdp: Fully Sharded Data Parallel
+    # deepspeed: DeepSpeed ZeRO optimization
+    distributed_backend: Literal["fsdp", "deepspeed"] | None = None
+
+    # Format for saving model checkpoints
+    # full_state: Save complete model state
+    # huggingface: Save in HuggingFace format (recommended for compatibility)
+    checkpoint_format: Literal["full_state", "huggingface"] | None = "huggingface"
+
+    # Template for formatting chat inputs and outputs
+    # Used to structure the conversation format for training
+    chat_template: str = "<|user|>\n{input}\n<|assistant|>\n{output}"
+
+    # Model-specific configuration parameters
+    # trust_remote_code: Allow execution of custom model code
+    # attn_implementation: Use SDPA (Scaled Dot Product Attention) for better performance
+    model_specific_config: dict = {
+        "trust_remote_code": True,
+        "attn_implementation": "sdpa",
+    }
+
+    # Maximum sequence length for training
+    # Set to 2048 as this is the maximum that works reliably on MPS (Apple Silicon)
+    # Longer sequences may cause memory issues on MPS devices
+    max_seq_length: int = 2048
+
+    # Enable gradient checkpointing to reduce memory usage
+    # Trades computation for memory by recomputing activations
+    gradient_checkpointing: bool = False
+
+    # Maximum number of checkpoints to keep
+    # Older checkpoints are deleted when this limit is reached
+    save_total_limit: int = 3
+
+    # Number of training steps between logging updates
+    logging_steps: int = 10
+
+    # Ratio of training steps used for learning rate warmup
+    # Helps stabilize early training
+    warmup_ratio: float = 0.1
+
+    # L2 regularization coefficient
+    # Helps prevent overfitting
+    weight_decay: float = 0.01
+
+    # Number of worker processes for data loading
+    # Higher values can improve data loading speed but increase memory usage
+    dataloader_num_workers: int = 4
+
+    # Whether to pin memory in data loader
+    # Can improve data transfer speed to GPU but uses more memory
+    dataloader_pin_memory: bool = True
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
+        return {"checkpoint_format": "huggingface", "distributed_backend": None, "device": "cpu"}
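
A short sketch of instantiating the new provider config for a CPU-only run; the overrides below are illustrative and everything else falls back to the defaults defined above:

    from llama_stack.providers.inline.post_training.huggingface.config import (
        HuggingFacePostTrainingConfig,
    )

    config = HuggingFacePostTrainingConfig(
        device="cpu",                 # default is "cuda"
        gradient_checkpointing=True,  # trade extra compute for lower memory
        max_seq_length=1024,
    )
    print(config.device, config.checkpoint_format)

    # Distribution templates use the same shape via sample_run_config():
    print(HuggingFacePostTrainingConfig.sample_run_config(__distro_dir__="<distro-dir>"))
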
diff --git a/llama_stack/providers/inline/post_training/huggingface/post_training.py b/llama_stack/providers/inline/post_training/huggingface/post_training.py
new file mode 100644
index 000000000..0b2760792
--- /dev/null
+++ b/llama_stack/providers/inline/post_training/huggingface/post_training.py
@@ -0,0 +1,176 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from enum import Enum
+from typing import Any
+
+from llama_stack.apis.datasetio import DatasetIO
+from llama_stack.apis.datasets import Datasets
+from llama_stack.apis.post_training import (
+    AlgorithmConfig,
+    Checkpoint,
+    DPOAlignmentConfig,
+    JobStatus,
+    ListPostTrainingJobsResponse,
+    PostTrainingJob,
+    PostTrainingJobArtifactsResponse,
+    PostTrainingJobStatusResponse,
+    TrainingConfig,
+)
+from llama_stack.providers.inline.post_training.huggingface.config import (
+    HuggingFacePostTrainingConfig,
+)
+from llama_stack.providers.inline.post_training.huggingface.recipes.finetune_single_device import (
+    HFFinetuningSingleDevice,
+)
+from llama_stack.providers.utils.scheduler import JobArtifact, Scheduler
+from llama_stack.providers.utils.scheduler import JobStatus as SchedulerJobStatus
+from llama_stack.schema_utils import webmethod
+
+
+class TrainingArtifactType(Enum):
+    CHECKPOINT = "checkpoint"
+    RESOURCES_STATS = "resources_stats"
+
+
+_JOB_TYPE_SUPERVISED_FINE_TUNE = "supervised-fine-tune"
+
+
+class HuggingFacePostTrainingImpl:
+    def __init__(
+        self,
+        config: HuggingFacePostTrainingConfig,
+        datasetio_api: DatasetIO,
+        datasets: Datasets,
+    ) -> None:
+        self.config = config
+        self.datasetio_api = datasetio_api
+        self.datasets_api = datasets
+        self._scheduler = Scheduler()
+
+    async def shutdown(self) -> None:
+        await self._scheduler.shutdown()
+
+    @staticmethod
+    def _checkpoint_to_artifact(checkpoint: Checkpoint) -> JobArtifact:
+        return JobArtifact(
+            type=TrainingArtifactType.CHECKPOINT.value,
+            name=checkpoint.identifier,
+            uri=checkpoint.path,
+            metadata=dict(checkpoint),
+        )
+
+    @staticmethod
+    def _resources_stats_to_artifact(resources_stats: dict[str, Any]) -> JobArtifact:
+        return JobArtifact(
+            type=TrainingArtifactType.RESOURCES_STATS.value,
+            name=TrainingArtifactType.RESOURCES_STATS.value,
+            metadata=resources_stats,
+        )
+
+    async def supervised_fine_tune(
+        self,
+        job_uuid: str,
+        training_config: TrainingConfig,
+        hyperparam_search_config: dict[str, Any],
+        logger_config: dict[str, Any],
+        model: str,
+        checkpoint_dir: str | None = None,
+        algorithm_config: AlgorithmConfig | None = None,
+    ) -> PostTrainingJob:
+        async def handler(on_log_message_cb, on_status_change_cb, on_artifact_collected_cb):
+            on_log_message_cb("Starting HF finetuning")
+
+            recipe = HFFinetuningSingleDevice(
+                job_uuid=job_uuid,
+                datasetio_api=self.datasetio_api,
+                datasets_api=self.datasets_api,
+            )
+
+            resources_allocated, checkpoints = await recipe.train(
+                model=model,
+                output_dir=checkpoint_dir,
+                job_uuid=job_uuid,
+                lora_config=algorithm_config,
+                config=training_config,
+                provider_config=self.config,
+            )
+
+            on_artifact_collected_cb(self._resources_stats_to_artifact(resources_allocated))
+            if checkpoints:
+                for checkpoint in checkpoints:
+                    artifact = self._checkpoint_to_artifact(checkpoint)
+                    on_artifact_collected_cb(artifact)
+
+            on_status_change_cb(SchedulerJobStatus.completed)
+            on_log_message_cb("HF finetuning completed")
+
+        job_uuid = self._scheduler.schedule(_JOB_TYPE_SUPERVISED_FINE_TUNE, job_uuid, handler)
+        return PostTrainingJob(job_uuid=job_uuid)
+
+    async def preference_optimize(
+        self,
+        job_uuid: str,
+        finetuned_model: str,
+        algorithm_config: DPOAlignmentConfig,
+        training_config: TrainingConfig,
+        hyperparam_search_config: dict[str, Any],
+        logger_config: dict[str, Any],
+    ) -> PostTrainingJob:
+        raise NotImplementedError("DPO alignment is not implemented yet")
+
+    async def get_training_jobs(self) -> ListPostTrainingJobsResponse:
+        return ListPostTrainingJobsResponse(
+            data=[PostTrainingJob(job_uuid=job.id) for job in self._scheduler.get_jobs()]
+        )
+
+    @staticmethod
+    def _get_artifacts_metadata_by_type(job, artifact_type):
+        return [artifact.metadata for artifact in job.artifacts if artifact.type == artifact_type]
+
+    @classmethod
+    def _get_checkpoints(cls, job):
+        return cls._get_artifacts_metadata_by_type(job, TrainingArtifactType.CHECKPOINT.value)
+
+    @classmethod
+    def _get_resources_allocated(cls, job):
+        data = cls._get_artifacts_metadata_by_type(job, TrainingArtifactType.RESOURCES_STATS.value)
+        return data[0] if data else None
+
+    @webmethod(route="/post-training/job/status")
+    async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse | None:
+        job = self._scheduler.get_job(job_uuid)
+
+        match job.status:
+            # TODO: Add support for other statuses to API
+            case SchedulerJobStatus.new | SchedulerJobStatus.scheduled:
+                status = JobStatus.scheduled
+            case SchedulerJobStatus.running:
+                status = JobStatus.in_progress
+            case SchedulerJobStatus.completed:
+                status = JobStatus.completed
+            case SchedulerJobStatus.failed:
+                status = JobStatus.failed
+            case _:
+                raise NotImplementedError()
+
+        return PostTrainingJobStatusResponse(
+            job_uuid=job_uuid,
+            status=status,
+            scheduled_at=job.scheduled_at,
+            started_at=job.started_at,
+            completed_at=job.completed_at,
+            checkpoints=self._get_checkpoints(job),
+            resources_allocated=self._get_resources_allocated(job),
+        )
+
+    @webmethod(route="/post-training/job/cancel")
+    async def cancel_training_job(self, job_uuid: str) -> None:
+        self._scheduler.cancel(job_uuid)
+
+    @webmethod(route="/post-training/job/artifacts")
+    async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse | None:
+        job = self._scheduler.get_job(job_uuid)
+        return PostTrainingJobArtifactsResponse(job_uuid=job_uuid, checkpoints=self._get_checkpoints(job))
diff --git a/llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py b/llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py
new file mode 100644
index 000000000..b6d13b029
--- /dev/null
+++ b/llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py
@@ -0,0 +1,683 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import gc
+import json
+import logging
+import multiprocessing
+import os
+import signal
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+import psutil
+
+from llama_stack.providers.inline.post_training.common.utils import evacuate_model_from_device
+
+# Disable tokenizer parallelism to avoid fork-related warnings and deadlocks in worker processes
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+# Force PyTorch to use OpenBLAS instead of MKL
+os.environ["MKL_THREADING_LAYER"] = "GNU"
+os.environ["MKL_SERVICE_FORCE_INTEL"] = "0"
+os.environ["MKL_NUM_THREADS"] = "1"
+
+import torch
+from datasets import Dataset
+from peft import LoraConfig
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+)
+from trl import SFTConfig, SFTTrainer
+
+from llama_stack.apis.datasetio import DatasetIO
+from llama_stack.apis.datasets import Datasets
+from llama_stack.apis.post_training import (
+    Checkpoint,
+    DataConfig,
+    LoraFinetuningConfig,
+    TrainingConfig,
+)
+
+from ..config import HuggingFacePostTrainingConfig
+
+logger = logging.getLogger(__name__)
+
+
+def get_gb(to_convert: int) -> str:
+    """Converts memory stats to GB and formats to 2 decimal places.
+    Args:
+        to_convert: Memory value in bytes
+    Returns:
+        str: Memory value in GB formatted to 2 decimal places
+    """
+    return f"{(to_convert / (1024**3)):.2f}"
+
+
+def get_memory_stats(device: torch.device) -> dict[str, Any]:
+    """Get memory statistics for the given device."""
+    stats = {
+        "system_memory": {
+            "total": get_gb(psutil.virtual_memory().total),
+            "available": get_gb(psutil.virtual_memory().available),
+            "used": get_gb(psutil.virtual_memory().used),
+            "percent": psutil.virtual_memory().percent,
+        }
+    }
+
+    if device.type == "cuda":
+        stats["device_memory"] = {
+            "allocated": get_gb(torch.cuda.memory_allocated(device)),
+            "reserved": get_gb(torch.cuda.memory_reserved(device)),
+            "max_allocated": get_gb(torch.cuda.max_memory_allocated(device)),
+        }
+    elif device.type == "mps":
+        # MPS doesn't provide direct memory stats, but we can track system memory
+        stats["device_memory"] = {
+            "note": "MPS memory stats not directly available",
+            "system_memory_used": get_gb(psutil.virtual_memory().used),
+        }
+    elif device.type == "cpu":
+        # For CPU, we track process memory usage
+        process = psutil.Process()
+        stats["device_memory"] = {
+            "process_rss": get_gb(process.memory_info().rss),
+            "process_vms": get_gb(process.memory_info().vms),
+            "process_percent": process.memory_percent(),
+        }
+
+    return stats
+
+
+def setup_torch_device(device_str: str) -> torch.device:
+    """Initialize and validate a PyTorch device.
+    This function handles device initialization and validation for different device types:
+    - CUDA: Validates CUDA availability and handles device selection
+    - MPS: Validates MPS availability for Apple Silicon
+    - CPU: Basic validation
+    - HPU: Raises error as it's not supported
+    Args:
+        device_str: String specifying the device ('cuda', 'cpu', 'mps')
+    Returns:
+        torch.device: The initialized and validated device
+    Raises:
+        RuntimeError: If device initialization fails or device is not supported
+    """
+    try:
+        device = torch.device(device_str)
+    except RuntimeError as e:
+        raise RuntimeError(f"Error getting Torch Device {str(e)}") from e
+
+    # Validate device capabilities
+    if device.type == "cuda":
+        if not torch.cuda.is_available():
+            raise RuntimeError(
+                f"{device.type}: Torch has no CUDA/ROCm support or could not detect a compatible device."
+            )
+        if device.index is None:
+            device = torch.device(device.type, torch.cuda.current_device())
+    elif device.type == "mps":
+        if not torch.backends.mps.is_available():
+            raise RuntimeError(f"{device.type}: Torch has no MPS support or could not detect a compatible device.")
+    elif device.type == "hpu":
+        raise RuntimeError(f"{device.type}: training does not support Intel Gaudi.")
+
+    return device
+
+
+class HFFinetuningSingleDevice:
+    def __init__(
+        self,
+        job_uuid: str,
+        datasetio_api: DatasetIO,
+        datasets_api: Datasets,
+    ):
+        self.datasetio_api = datasetio_api
+        self.datasets_api = datasets_api
+        self.job_uuid = job_uuid
+
+    def validate_dataset_format(self, rows: list[dict]) -> bool:
+        """Validate that the dataset has the required fields."""
+        required_fields = ["input_query", "expected_answer", "chat_completion_input"]
+        return all(field in row for row in rows for field in required_fields)
+
+    def _process_instruct_format(self, row: dict) -> tuple[str | None, str | None]:
+        """Process a row in instruct format."""
+        if "chat_completion_input" in row and "expected_answer" in row:
+            try:
+                messages = json.loads(row["chat_completion_input"])
+                if not isinstance(messages, list) or len(messages) != 1:
+                    logger.warning(f"Invalid chat_completion_input format: {row['chat_completion_input']}")
+                    return None, None
+                if "content" not in messages[0]:
+                    logger.warning(f"Message missing content: {messages[0]}")
+                    return None, None
+                return messages[0]["content"], row["expected_answer"]
+            except json.JSONDecodeError:
+                logger.warning(f"Failed to parse chat_completion_input: {row['chat_completion_input']}")
+                return None, None
+        return None, None
+
+    def _process_dialog_format(self, row: dict) -> tuple[str | None, str | None]:
+        """Process a row in dialog format."""
+        if "dialog" in row:
+            try:
+                dialog = json.loads(row["dialog"])
+                if not isinstance(dialog, list) or len(dialog) < 2:
+                    logger.warning(f"Dialog must have at least 2 messages: {row['dialog']}")
+                    return None, None
+                if dialog[0].get("role") != "user":
+                    logger.warning(f"First message must be from user: {dialog[0]}")
+                    return None, None
+                if not any(msg.get("role") == "assistant" for msg in dialog):
+                    logger.warning("Dialog must have at least one assistant message")
+                    return None, None
+
+                # Convert to human/gpt format
+                role_map = {"user": "human", "assistant": "gpt"}
+                conversations = []
+                for msg in dialog:
+                    if "role" not in msg or "content" not in msg:
+                        logger.warning(f"Message missing role or content: {msg}")
+                        continue
+                    conversations.append({"from": role_map[msg["role"]], "value": msg["content"]})
+
+                # Use the first two turns of the dialog as a single input/output training pair
+                return conversations[0]["value"], conversations[1]["value"]
+            except json.JSONDecodeError:
+                logger.warning(f"Failed to parse dialog: {row['dialog']}")
+                return None, None
+        return None, None
+
+    def _process_fallback_format(self, row: dict) -> tuple[str | None, str | None]:
+        """Process a row using fallback formats."""
+        if "input" in row and "output" in row:
+            return row["input"], row["output"]
+        elif "prompt" in row and "completion" in row:
+            return row["prompt"], row["completion"]
+        elif "question" in row and "answer" in row:
+            return row["question"], row["answer"]
+        return None, None
+
+    def _format_text(self, input_text: str, output_text: str, provider_config: HuggingFacePostTrainingConfig) -> str:
+        """Format input and output text based on model requirements."""
+        if hasattr(provider_config, "chat_template"):
+            return provider_config.chat_template.format(input=input_text, output=output_text)
+        return f"{input_text}\n{output_text}"
+
+    def _create_dataset(
+        self, rows: list[dict], config: TrainingConfig, provider_config: HuggingFacePostTrainingConfig
+    ) -> Dataset:
+        """Create and preprocess the dataset."""
+        formatted_rows = []
+        for row in rows:
+            input_text = None
+            output_text = None
+
+            # Process based on format
+            assert isinstance(config.data_config, DataConfig), "DataConfig must be initialized"
+            if config.data_config.data_format.value == "instruct":
+                input_text, output_text = self._process_instruct_format(row)
+            elif config.data_config.data_format.value == "dialog":
+                input_text, output_text = self._process_dialog_format(row)
+            else:
+                input_text, output_text = self._process_fallback_format(row)
+
+            if input_text and output_text:
+                formatted_text = self._format_text(input_text, output_text, provider_config)
+                formatted_rows.append({"text": formatted_text})
+
+        if not formatted_rows:
+            assert isinstance(config.data_config, DataConfig), "DataConfig must be initialized"
+            raise ValueError(
+                f"No valid input/output pairs found in the dataset for format: {config.data_config.data_format.value}"
+            )
+
+        return Dataset.from_list(formatted_rows)
+
+    def _preprocess_dataset(
+        self, ds: Dataset, tokenizer: AutoTokenizer, provider_config: HuggingFacePostTrainingConfig
+    ) -> Dataset:
+        """Preprocess the dataset with tokenizer."""
+
+        def tokenize_function(examples):
+            return tokenizer(
+                examples["text"],
+                padding=True,
+                truncation=True,
+                max_length=provider_config.max_seq_length,
+                return_tensors=None,
+            )
+
+        return ds.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=ds.column_names,
+        )
+
+    async def _setup_data(self, dataset_id: str) -> list[dict[str, Any]]:
+        """Load dataset from llama stack dataset provider"""
+        try:
+            all_rows = await self.datasetio_api.iterrows(
+                dataset_id=dataset_id,
+                limit=-1,
+            )
+            if not isinstance(all_rows.data, list):
+                raise RuntimeError("Expected dataset data to be a list")
+            return all_rows.data
+        except Exception as e:
+            raise RuntimeError(f"Failed to load dataset: {str(e)}") from e
+
+    def _run_training_sync(
+        self,
+        model: str,
+        provider_config: dict[str, Any],
+        peft_config: LoraConfig | None,
+        config: dict[str, Any],
+        output_dir_path: Path | None,
+    ) -> None:
+        """Synchronous wrapper for running training process.
+        This method serves as a bridge between the multiprocessing Process and the async training function.
+        It creates a new event loop to run the async training process.
+        Args:
+            model: The model identifier to load
+            provider_config: Configuration specific to the HuggingFace provider
+            peft_config: Optional LoRA configuration
+            config: General training configuration
+            output_dir_path: Optional path to save the model
+        """
+        import asyncio
+
+        logger.info("Starting training process with async wrapper")
+        asyncio.run(
+            self._run_training(
+                model=model,
+                provider_config=provider_config,
+                peft_config=peft_config,
+                config=config,
+                output_dir_path=output_dir_path,
+            )
+        )
+
+    async def load_dataset(
+        self,
+        model: str,
+        config: TrainingConfig,
+        provider_config: HuggingFacePostTrainingConfig,
+    ) -> tuple[Dataset, Dataset, AutoTokenizer]:
+        """Load and prepare the dataset for training.
+        Args:
+            model: The model identifier to load
+            config: Training configuration
+            provider_config: Provider-specific configuration
+        Returns:
+            tuple: (train_dataset, eval_dataset, tokenizer)
+        """
+        # Validate data config
+        if not config.data_config:
+            raise ValueError("DataConfig is required for training")
+
+        # Load dataset
+        logger.info(f"Loading dataset: {config.data_config.dataset_id}")
+        rows = await self._setup_data(config.data_config.dataset_id)
+        if not self.validate_dataset_format(rows):
+            raise ValueError("Dataset is missing required fields: input_query, expected_answer, chat_completion_input")
+        logger.info(f"Loaded {len(rows)} rows from dataset")
+
+        # Initialize tokenizer
+        logger.info(f"Initializing tokenizer for model: {model}")
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(model, **provider_config.model_specific_config)
+
+            # Set pad token to eos token if not present
+            # This is common for models that don't have a dedicated pad token
+            if not tokenizer.pad_token:
+                tokenizer.pad_token = tokenizer.eos_token
+
+            # Set padding side to right for causal language modeling
+            # This ensures that padding tokens don't interfere with the model's ability
+            # to predict the next token in the sequence
+            tokenizer.padding_side = "right"
+
+            # Set truncation side to right to keep the beginning of the sequence
+            # This is important for maintaining context and instruction format
+            tokenizer.truncation_side = "right"
+
+            # Set model max length to match provider config
+            # This ensures consistent sequence lengths across the training process
+            tokenizer.model_max_length = provider_config.max_seq_length
+
+            logger.info("Tokenizer initialized successfully")
+        except Exception as e:
+            raise RuntimeError(f"Failed to initialize tokenizer: {str(e)}") from e
+
+        # Create and preprocess dataset
+        logger.info("Creating and preprocessing dataset")
+        try:
+            ds = self._create_dataset(rows, config, provider_config)
+            ds = self._preprocess_dataset(ds, tokenizer, provider_config)
+            logger.info(f"Dataset created with {len(ds)} examples")
+        except Exception as e:
+            raise ValueError(f"Failed to create dataset: {str(e)}") from e
+
+        # Split dataset
+        logger.info("Splitting dataset into train and validation sets")
+        train_val_split = ds.train_test_split(test_size=0.1, seed=42)
+        train_dataset = train_val_split["train"]
+        eval_dataset = train_val_split["test"]
+        logger.info(f"Split dataset into {len(train_dataset)} training and {len(eval_dataset)} validation examples")
+
+        return train_dataset, eval_dataset, tokenizer
+
+    def load_model(
+        self,
+        model: str,
+        device: torch.device,
+        provider_config: HuggingFacePostTrainingConfig,
+    ) -> AutoModelForCausalLM:
+        """Load and initialize the model for training.
+        Args:
+            model: The model identifier to load
+            device: The device to load the model onto
+            provider_config: Provider-specific configuration
+        Returns:
+            The loaded and initialized model
+        Raises:
+            RuntimeError: If model loading fails
+        """
+        logger.info("Loading the base model")
+        try:
+            model_config = AutoConfig.from_pretrained(model, **provider_config.model_specific_config)
+            model_obj = AutoModelForCausalLM.from_pretrained(
+                model,
+                torch_dtype="auto" if device.type != "cpu" else "float32",
+                quantization_config=None,
+                config=model_config,
+                **provider_config.model_specific_config,
+            )
+            # Always move model to specified device
+            model_obj = model_obj.to(device)
+            logger.info(f"Model loaded and moved to device: {model_obj.device}")
+            return model_obj
+        except Exception as e:
+            raise RuntimeError(f"Failed to load model: {str(e)}") from e
+
+    def setup_training_args(
+        self,
+        config: TrainingConfig,
+        provider_config: HuggingFacePostTrainingConfig,
+        device: torch.device,
+        output_dir_path: Path | None,
+        steps_per_epoch: int,
+    ) -> SFTConfig:
+        """Setup training arguments.
+        Args:
+            config: Training configuration
+            provider_config: Provider-specific configuration
+            device: The device to train on
+            output_dir_path: Optional path to save the model
+            steps_per_epoch: Number of steps per epoch
+        Returns:
+            Configured SFTConfig object
+        """
+        logger.info("Configuring training arguments")
+        lr = 2e-5
+        if config.optimizer_config:
+            lr = config.optimizer_config.lr
+            logger.info(f"Using custom learning rate: {lr}")
+
+        # Validate data config
+        if not config.data_config:
+            raise ValueError("DataConfig is required for training")
+        data_config = config.data_config
+
+        # Calculate steps
+        total_steps = steps_per_epoch * config.n_epochs
+        max_steps = min(config.max_steps_per_epoch, total_steps)
+        eval_steps = max(1, steps_per_epoch // 10)  # Evaluate 10 times per epoch
+        save_steps = max(1, steps_per_epoch // 5)  # Save 5 times per epoch
+        logging_steps = max(1, steps_per_epoch // 50)  # Log 50 times per epoch
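+        # Illustrative example: with steps_per_epoch=100 this yields eval_steps=10,
+        # save_steps=20 and logging_steps=2, i.e. ~10 evals, ~5 saves and ~50 log
+        # entries per epoch.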
+
+        logger.info("Training configuration:")
+        logger.info(f"- Steps per epoch: {steps_per_epoch}")
+        logger.info(f"- Total steps: {total_steps}")
+        logger.info(f"- Max steps: {max_steps}")
+        logger.info(f"- Eval steps: {eval_steps}")
+        logger.info(f"- Save steps: {save_steps}")
+        logger.info(f"- Logging steps: {logging_steps}")
+
+        # Configure save strategy
+        save_strategy = "no"
+        if output_dir_path:
+            save_strategy = "steps"
+            logger.info(f"Will save checkpoints to {output_dir_path}")
+
+        return SFTConfig(
+            max_steps=max_steps,
+            output_dir=str(output_dir_path) if output_dir_path is not None else None,
+            num_train_epochs=config.n_epochs,
+            per_device_train_batch_size=data_config.batch_size,
+            fp16=device.type == "cuda",
+            bf16=False,  # Causes CPU issues.
+            eval_strategy="steps",
+            use_cpu=True if device.type == "cpu" and not torch.backends.mps.is_available() else False,
+            save_strategy=save_strategy,
+            report_to="none",
+            max_seq_length=provider_config.max_seq_length,
+            gradient_accumulation_steps=config.gradient_accumulation_steps,
+            gradient_checkpointing=provider_config.gradient_checkpointing,
+            learning_rate=lr,
+            warmup_ratio=provider_config.warmup_ratio,
+            weight_decay=provider_config.weight_decay,
+            remove_unused_columns=False,
+            dataloader_pin_memory=provider_config.dataloader_pin_memory,
+            dataloader_num_workers=provider_config.dataloader_num_workers,
+            dataset_text_field="text",
+            packing=False,
+            load_best_model_at_end=True if output_dir_path else False,
+            metric_for_best_model="eval_loss",
+            greater_is_better=False,
+            eval_steps=eval_steps,
+            save_steps=save_steps,
+            logging_steps=logging_steps,
+        )
+
+    def save_model(
+        self,
+        model_obj: AutoModelForCausalLM,
+        trainer: SFTTrainer,
+        peft_config: LoraConfig | None,
+        output_dir_path: Path,
+    ) -> None:
+        """Save the trained model.
+        Args:
+            model_obj: The model to save
+            trainer: The trainer instance
+            peft_config: Optional LoRA configuration
+            output_dir_path: Path to save the model
+        """
+        logger.info("Saving final model")
+        model_obj.config.use_cache = True
+
+        if peft_config:
+            logger.info("Merging LoRA weights with base model")
+            model_obj = trainer.model.merge_and_unload()
+        else:
+            model_obj = trainer.model
+
+        save_path = output_dir_path / "merged_model"
+        logger.info(f"Saving model to {save_path}")
+        model_obj.save_pretrained(save_path)
+
+    async def _run_training(
+        self,
+        model: str,
+        provider_config: dict[str, Any],
+        peft_config: LoraConfig | None,
+        config: dict[str, Any],
+        output_dir_path: Path | None,
+    ) -> None:
+        """Run the training process with signal handling."""
+
+        def signal_handler(signum, frame):
+            """Handle termination signals gracefully."""
+            logger.info(f"Received signal {signum}, initiating graceful shutdown")
+            sys.exit(0)
+
+        signal.signal(signal.SIGTERM, signal_handler)
+        signal.signal(signal.SIGINT, signal_handler)
+
+        # Convert config dicts back to objects
+        logger.info("Initializing configuration objects")
+        provider_config_obj = HuggingFacePostTrainingConfig(**provider_config)
+        config_obj = TrainingConfig(**config)
+
+        # Initialize and validate device
+        device = setup_torch_device(provider_config_obj.device)
+        logger.info(f"Using device '{device}'")
+
+        # Load dataset and tokenizer
+        train_dataset, eval_dataset, tokenizer = await self.load_dataset(model, config_obj, provider_config_obj)
+
+        # Calculate steps per epoch
+        if not config_obj.data_config:
+            raise ValueError("DataConfig is required for training")
+        steps_per_epoch = len(train_dataset) // config_obj.data_config.batch_size
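+        # Illustrative example: 1,000 formatted rows split 90/10 leave 900 training
+        # examples; with batch_size=8 that is 900 // 8 = 112 steps per epoch.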
+
+        # Setup training arguments
+        training_args = self.setup_training_args(
+            config_obj,
+            provider_config_obj,
+            device,
+            output_dir_path,
+            steps_per_epoch,
+        )
+
+        # Load model
+        model_obj = self.load_model(model, device, provider_config_obj)
+
+        # Initialize trainer
+        logger.info("Initializing SFTTrainer")
+        trainer = SFTTrainer(
+            model=model_obj,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            peft_config=peft_config,
+            args=training_args,
+        )
+
+        try:
+            # Train
+            logger.info("Starting training")
+            trainer.train()
+            logger.info("Training completed successfully")
+
+            # Save final model if output directory is provided
+            if output_dir_path:
+                self.save_model(model_obj, trainer, peft_config, output_dir_path)
+
+        finally:
+            # Clean up resources
+            logger.info("Cleaning up resources")
+            if hasattr(trainer, "model"):
+                evacuate_model_from_device(trainer.model, device.type)
+            del trainer
+            gc.collect()
+            logger.info("Cleanup completed")
+
+    async def train(
+        self,
+        model: str,
+        output_dir: str | None,
+        job_uuid: str,
+        lora_config: LoraFinetuningConfig,
+        config: TrainingConfig,
+        provider_config: HuggingFacePostTrainingConfig,
+    ) -> tuple[dict[str, Any], list[Checkpoint] | None]:
+        """Train a model using HuggingFace's SFTTrainer"""
+        # Initialize and validate device
+        device = setup_torch_device(provider_config.device)
+        logger.info(f"Using device '{device}'")
+
+        output_dir_path = None
+        if output_dir:
+            output_dir_path = Path(output_dir)
+
+        # Track memory stats
+        memory_stats = {
+            "initial": get_memory_stats(device),
+            "after_training": None,
+            "final": None,
+        }
+
+        # Configure LoRA
+        peft_config = None
+        if lora_config:
+            peft_config = LoraConfig(
+                lora_alpha=lora_config.alpha,
+                lora_dropout=0.1,
+                r=lora_config.rank,
+                bias="none",
+                task_type="CAUSAL_LM",
+                target_modules=lora_config.lora_attn_modules,
+            )
+
+        # Validate data config
+        if not config.data_config:
+            raise ValueError("DataConfig is required for training")
+
+        # Train in a separate process
+        logger.info("Starting training in separate process")
+        try:
+            # Set multiprocessing start method to 'spawn' for CUDA/MPS compatibility
+            if device.type in ["cuda", "mps"]:
+                multiprocessing.set_start_method("spawn", force=True)
+
+            process = multiprocessing.Process(
+                target=self._run_training_sync,
+                kwargs={
+                    "model": model,
+                    "provider_config": provider_config.model_dump(),
+                    "peft_config": peft_config,
+                    "config": config.model_dump(),
+                    "output_dir_path": output_dir_path,
+                },
+            )
+            process.start()
+
+            # Monitor the process
+            while process.is_alive():
+                process.join(timeout=1)  # Check every second
+                if not process.is_alive():
+                    break
+
+            # Get the return code
+            if process.exitcode != 0:
+                raise RuntimeError(f"Training failed with exit code {process.exitcode}")
+
+            memory_stats["after_training"] = get_memory_stats(device)
+
+            checkpoints = None
+            if output_dir_path:
+                # Create checkpoint
+                checkpoint = Checkpoint(
+                    identifier=f"{model}-sft-{config.n_epochs}",
+                    created_at=datetime.now(timezone.utc),
+                    epoch=config.n_epochs,
+                    post_training_job_id=job_uuid,
+                    path=str(output_dir_path / "merged_model"),
+                )
+                checkpoints = [checkpoint]
+
+            return memory_stats, checkpoints
+        finally:
+            memory_stats["final"] = get_memory_stats(device)
+            gc.collect()
diff --git a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
index 1239523cd..f56dd2499 100644
--- a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
+++ b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-import gc
 import logging
 import os
 import time
@@ -39,7 +38,6 @@ from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.post_training import (
     Checkpoint,
     DataConfig,
-    EfficiencyConfig,
     LoraFinetuningConfig,
     OptimizerConfig,
     QATFinetuningConfig,
@@ -48,6 +46,7 @@ from llama_stack.apis.post_training import (
 from llama_stack.distribution.utils.config_dirs import DEFAULT_CHECKPOINT_DIR
 from llama_stack.distribution.utils.model_utils import model_local_dir
 from llama_stack.models.llama.sku_list import resolve_model
+from llama_stack.providers.inline.post_training.common.utils import evacuate_model_from_device
 from llama_stack.providers.inline.post_training.torchtune.common import utils
 from llama_stack.providers.inline.post_training.torchtune.common.checkpointer import (
     TorchtuneCheckpointer,
@@ -90,8 +89,6 @@ class LoraFinetuningSingleDevice:
     ) -> None:
         assert isinstance(training_config.data_config, DataConfig), "DataConfig must be initialized"
 
-        assert isinstance(training_config.efficiency_config, EfficiencyConfig), "EfficiencyConfig must be initialized"
-
         self.job_uuid = job_uuid
         self.training_config = training_config
         if not isinstance(algorithm_config, LoraFinetuningConfig):
@@ -557,11 +554,7 @@ class LoraFinetuningSingleDevice:
             checkpoints.append(checkpoint)
 
         # clean up the memory after training finishes
-        if self._device.type != "cpu":
-            self._model.to("cpu")
-            torch.cuda.empty_cache()
-        del self._model
-        gc.collect()
+        evacuate_model_from_device(self._model, self._device.type)
 
         return (memory_stats, checkpoints)
 
diff --git a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
index 9295d5cab..67362dd36 100644
--- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
@@ -20,7 +20,10 @@ from opentelemetry.semconv.resource import ResourceAttributes
 from llama_stack.apis.telemetry import (
     Event,
     MetricEvent,
+    MetricLabelMatcher,
+    MetricQueryType,
     QueryCondition,
+    QueryMetricsResponse,
     QuerySpanTreeResponse,
     QueryTracesResponse,
     Span,
@@ -123,6 +126,17 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
         else:
             raise ValueError(f"Unknown event type: {event}")
 
+    async def query_metrics(
+        self,
+        metric_name: str,
+        start_time: int,
+        end_time: int | None = None,
+        granularity: str | None = "1d",
+        query_type: MetricQueryType = MetricQueryType.RANGE,
+        label_matchers: list[MetricLabelMatcher] | None = None,
+    ) -> QueryMetricsResponse:
+        raise NotImplementedError("Querying metrics is not implemented")
+
     def _log_unstructured(self, event: UnstructuredLogEvent, ttl_seconds: int) -> None:
         with self._lock:
             # Use global storage instead of instance storage
diff --git a/llama_stack/providers/inline/tool_runtime/rag/memory.py b/llama_stack/providers/inline/tool_runtime/rag/memory.py
index df0257718..39f752297 100644
--- a/llama_stack/providers/inline/tool_runtime/rag/memory.py
+++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py
@@ -87,6 +87,7 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
                     content,
                     chunk_size_in_tokens,
                     chunk_size_in_tokens // 4,
+                    doc.metadata,
                 )
             )
 
@@ -105,7 +106,9 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
         query_config: RAGQueryConfig | None = None,
     ) -> RAGQueryResult:
         if not vector_db_ids:
-            return RAGQueryResult(content=None)
+            raise ValueError(
+                "No vector DBs were provided to the knowledge search tool. Please provide at least one vector DB ID."
+            )
 
         query_config = query_config or RAGQueryConfig()
         query = await generate_rag_query(
@@ -140,19 +143,21 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
                 text=f"knowledge_search tool found {len(chunks)} chunks:\nBEGIN of knowledge_search tool results.\n"
             )
         ]
-        for i, c in enumerate(chunks):
-            metadata = c.metadata
+        for i, chunk in enumerate(chunks):
+            metadata = chunk.metadata
             tokens += metadata["token_count"]
+            tokens += metadata["metadata_token_count"]
+
             if tokens > query_config.max_tokens_in_context:
                 log.error(
                     f"Using {len(picked)} chunks; reached max tokens in context: {tokens}",
                 )
                 break
-            picked.append(
-                TextContentItem(
-                    text=f"Result {i + 1}:\nDocument_id:{metadata['document_id'][:5]}\nContent: {c.content}\n",
-                )
-            )
+
+            metadata_subset = {k: v for k, v in metadata.items() if k not in ["token_count", "metadata_token_count"]}
+            text_content = query_config.chunk_template.format(index=i + 1, chunk=chunk, metadata=metadata_subset)
+            picked.append(TextContentItem(text=text_content))
+
         picked.append(TextContentItem(text="END of knowledge_search tool results.\n"))
         picked.append(
             TextContentItem(
diff --git a/llama_stack/providers/inline/vector_io/faiss/faiss.py b/llama_stack/providers/inline/vector_io/faiss/faiss.py
index 5d5b1da35..d3dc7e694 100644
--- a/llama_stack/providers/inline/vector_io/faiss/faiss.py
+++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py
@@ -125,7 +125,7 @@ class FaissVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
         # Load existing banks from kvstore
         start_key = VECTOR_DBS_PREFIX
         end_key = f"{VECTOR_DBS_PREFIX}\xff"
-        stored_vector_dbs = await self.kvstore.range(start_key, end_key)
+        stored_vector_dbs = await self.kvstore.values_in_range(start_key, end_key)
 
         for vector_db_data in stored_vector_dbs:
             vector_db = VectorDB.model_validate_json(vector_db_data)
diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py
index b0abc1818..7b49ef09b 100644
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@@ -280,11 +280,10 @@ def available_providers() -> list[ProviderSpec]:
             api=Api.inference,
             adapter=AdapterSpec(
                 adapter_type="sambanova",
-                pip_packages=[
-                    "openai",
-                ],
+                pip_packages=["litellm"],
                 module="llama_stack.providers.remote.inference.sambanova",
                 config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig",
+                provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator",
             ),
         ),
         remote_provider_spec(
diff --git a/llama_stack/providers/registry/post_training.py b/llama_stack/providers/registry/post_training.py
index 35567c07d..d752b8819 100644
--- a/llama_stack/providers/registry/post_training.py
+++ b/llama_stack/providers/registry/post_training.py
@@ -21,6 +21,17 @@ def available_providers() -> list[ProviderSpec]:
                 Api.datasets,
             ],
         ),
+        InlineProviderSpec(
+            api=Api.post_training,
+            provider_type="inline::huggingface",
+            pip_packages=["torch", "trl", "transformers", "peft", "datasets"],
+            module="llama_stack.providers.inline.post_training.huggingface",
+            config_class="llama_stack.providers.inline.post_training.huggingface.HuggingFacePostTrainingConfig",
+            api_dependencies=[
+                Api.datasetio,
+                Api.datasets,
+            ],
+        ),
         remote_provider_spec(
             api=Api.post_training,
             adapter=AdapterSpec(
diff --git a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
index baecf45e9..fafd1d8ff 100644
--- a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
+++ b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
@@ -12,8 +12,8 @@ from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Dataset
 from llama_stack.providers.datatypes import DatasetsProtocolPrivate
-from llama_stack.providers.utils.datasetio.pagination import paginate_records
 from llama_stack.providers.utils.kvstore import kvstore_impl
+from llama_stack.providers.utils.pagination import paginate_records
 
 from .config import HuggingfaceDatasetIOConfig
 
@@ -42,7 +42,7 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
         # Load existing datasets from kvstore
         start_key = DATASETS_PREFIX
         end_key = f"{DATASETS_PREFIX}\xff"
-        stored_datasets = await self.kvstore.range(start_key, end_key)
+        stored_datasets = await self.kvstore.values_in_range(start_key, end_key)
 
         for dataset in stored_datasets:
             dataset = Dataset.model_validate_json(dataset)
diff --git a/llama_stack/providers/remote/inference/cerebras_openai_compat/__init__.py b/llama_stack/providers/remote/inference/cerebras_openai_compat/__init__.py
index a5f07edd2..523a8dfe7 100644
--- a/llama_stack/providers/remote/inference/cerebras_openai_compat/__init__.py
+++ b/llama_stack/providers/remote/inference/cerebras_openai_compat/__init__.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider
 
 from .config import CerebrasCompatConfig
 
 
-async def get_adapter_impl(config: CerebrasCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: CerebrasCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .cerebras import CerebrasCompatInferenceAdapter
 
diff --git a/llama_stack/providers/remote/inference/fireworks_openai_compat/__init__.py b/llama_stack/providers/remote/inference/fireworks_openai_compat/__init__.py
index f78f218b5..15a666cb6 100644
--- a/llama_stack/providers/remote/inference/fireworks_openai_compat/__init__.py
+++ b/llama_stack/providers/remote/inference/fireworks_openai_compat/__init__.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider
 
 from .config import FireworksCompatConfig
 
 
-async def get_adapter_impl(config: FireworksCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: FireworksCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .fireworks import FireworksCompatInferenceAdapter
 
diff --git a/llama_stack/providers/remote/inference/groq_openai_compat/__init__.py b/llama_stack/providers/remote/inference/groq_openai_compat/__init__.py
index 8161df20d..794cdebd7 100644
--- a/llama_stack/providers/remote/inference/groq_openai_compat/__init__.py
+++ b/llama_stack/providers/remote/inference/groq_openai_compat/__init__.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider
 
 from .config import GroqCompatConfig
 
 
-async def get_adapter_impl(config: GroqCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: GroqCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .groq import GroqCompatInferenceAdapter
 
diff --git a/llama_stack/providers/remote/inference/llama_openai_compat/__init__.py b/llama_stack/providers/remote/inference/llama_openai_compat/__init__.py
index a6fb37cad..be48d1067 100644
--- a/llama_stack/providers/remote/inference/llama_openai_compat/__init__.py
+++ b/llama_stack/providers/remote/inference/llama_openai_compat/__init__.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider
 
 from .config import LlamaCompatConfig
 
 
-async def get_adapter_impl(config: LlamaCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: LlamaCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .llama import LlamaCompatInferenceAdapter
 
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index 0cf63097b..3b4287673 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -28,7 +28,7 @@ from llama_stack.apis.inference import (
     EmbeddingsResponse,
     EmbeddingTaskType,
     GrammarResponseFormat,
-    Inference,
+    InferenceProvider,
     JsonSchemaResponseFormat,
     LogProbConfig,
     Message,
@@ -61,6 +61,7 @@ from llama_stack.providers.utils.inference.openai_compat import (
     OpenAICompatCompletionChoice,
     OpenAICompatCompletionResponse,
     get_sampling_options,
+    prepare_openai_completion_params,
     process_chat_completion_response,
     process_chat_completion_stream_response,
     process_completion_response,
@@ -81,7 +82,7 @@ logger = get_logger(name=__name__, category="inference")
 
 
 class OllamaInferenceAdapter(
-    Inference,
+    InferenceProvider,
     ModelsProtocolPrivate,
 ):
     def __init__(self, url: str) -> None:
@@ -139,6 +140,8 @@ class OllamaInferenceAdapter(
         if sampling_params is None:
             sampling_params = SamplingParams()
         model = await self._get_model(model_id)
+        if model.provider_resource_id is None:
+            raise ValueError(f"Model {model_id} has no provider_resource_id set")
         request = CompletionRequest(
             model=model.provider_resource_id,
             content=content,
@@ -202,6 +205,8 @@ class OllamaInferenceAdapter(
         if sampling_params is None:
             sampling_params = SamplingParams()
         model = await self._get_model(model_id)
+        if model.provider_resource_id is None:
+            raise ValueError(f"Model {model_id} has no provider_resource_id set")
         request = ChatCompletionRequest(
             model=model.provider_resource_id,
             messages=messages,
@@ -346,6 +351,8 @@ class OllamaInferenceAdapter(
         #  - models not currently running are run by the ollama server as needed
         response = await self.client.list()
         available_models = [m["model"] for m in response["models"]]
+        if model.provider_resource_id is None:
+            raise ValueError("Model provider_resource_id cannot be None")
         provider_resource_id = self.register_helper.get_provider_model_id(model.provider_resource_id)
         if provider_resource_id is None:
             provider_resource_id = model.provider_resource_id
@@ -389,29 +396,25 @@ class OllamaInferenceAdapter(
             raise ValueError("Ollama does not support non-string prompts for completion")
 
         model_obj = await self._get_model(model)
-        params = {
-            k: v
-            for k, v in {
-                "model": model_obj.provider_resource_id,
-                "prompt": prompt,
-                "best_of": best_of,
-                "echo": echo,
-                "frequency_penalty": frequency_penalty,
-                "logit_bias": logit_bias,
-                "logprobs": logprobs,
-                "max_tokens": max_tokens,
-                "n": n,
-                "presence_penalty": presence_penalty,
-                "seed": seed,
-                "stop": stop,
-                "stream": stream,
-                "stream_options": stream_options,
-                "temperature": temperature,
-                "top_p": top_p,
-                "user": user,
-            }.items()
-            if v is not None
-        }
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+        )
         return await self.openai_client.completions.create(**params)  # type: ignore
 
     async def openai_chat_completion(
@@ -441,41 +444,31 @@ class OllamaInferenceAdapter(
         user: str | None = None,
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         model_obj = await self._get_model(model)
-
-        # ollama still makes tool calls even when tool_choice is "none"
-        # so we need to remove the tools in that case
-        if tool_choice == "none" and tools is not None:
-            tools = None
-
-        params = {
-            k: v
-            for k, v in {
-                "model": model_obj.provider_resource_id,
-                "messages": messages,
-                "frequency_penalty": frequency_penalty,
-                "function_call": function_call,
-                "functions": functions,
-                "logit_bias": logit_bias,
-                "logprobs": logprobs,
-                "max_completion_tokens": max_completion_tokens,
-                "max_tokens": max_tokens,
-                "n": n,
-                "parallel_tool_calls": parallel_tool_calls,
-                "presence_penalty": presence_penalty,
-                "response_format": response_format,
-                "seed": seed,
-                "stop": stop,
-                "stream": stream,
-                "stream_options": stream_options,
-                "temperature": temperature,
-                "tool_choice": tool_choice,
-                "tools": tools,
-                "top_logprobs": top_logprobs,
-                "top_p": top_p,
-                "user": user,
-            }.items()
-            if v is not None
-        }
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
         return await self.openai_client.chat.completions.create(**params)  # type: ignore
 
     async def batch_completion(
diff --git a/llama_stack/providers/remote/inference/openai/models.py b/llama_stack/providers/remote/inference/openai/models.py
index 1737043a4..e029c456c 100644
--- a/llama_stack/providers/remote/inference/openai/models.py
+++ b/llama_stack/providers/remote/inference/openai/models.py
@@ -4,27 +4,60 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from dataclasses import dataclass
+
 from llama_stack.apis.models.models import ModelType
 from llama_stack.providers.utils.inference.model_registry import (
     ProviderModelEntry,
 )
 
 LLM_MODEL_IDS = [
+    # the models w/ "openai/" prefix are the litellm specific model names.
+    # they should be deprecated in favor of the canonical openai model names.
     "openai/gpt-4o",
     "openai/gpt-4o-mini",
     "openai/chatgpt-4o-latest",
+    "gpt-3.5-turbo-0125",
+    "gpt-3.5-turbo",
+    "gpt-3.5-turbo-instruct",
+    "gpt-4",
+    "gpt-4-turbo",
+    "gpt-4o",
+    "gpt-4o-2024-08-06",
+    "gpt-4o-mini",
+    "gpt-4o-audio-preview",
+    "chatgpt-4o-latest",
+    "o1",
+    "o1-mini",
+    "o3-mini",
+    "o4-mini",
 ]
 
 
+@dataclass
+class EmbeddingModelInfo:
+    """Structured representation of embedding model information."""
+
+    embedding_dimension: int
+    context_length: int
+
+
+EMBEDDING_MODEL_IDS: dict[str, EmbeddingModelInfo] = {
+    "openai/text-embedding-3-small": EmbeddingModelInfo(1536, 8192),
+    "openai/text-embedding-3-large": EmbeddingModelInfo(3072, 8192),
+    "text-embedding-3-small": EmbeddingModelInfo(1536, 8192),
+    "text-embedding-3-large": EmbeddingModelInfo(3072, 8192),
+}
+
+
 MODEL_ENTRIES = [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS] + [
     ProviderModelEntry(
-        provider_model_id="openai/text-embedding-3-small",
+        provider_model_id=model_id,
         model_type=ModelType.embedding,
-        metadata={"embedding_dimension": 1536, "context_length": 8192},
-    ),
-    ProviderModelEntry(
-        provider_model_id="openai/text-embedding-3-large",
-        model_type=ModelType.embedding,
-        metadata={"embedding_dimension": 3072, "context_length": 8192},
-    ),
+        metadata={
+            "embedding_dimension": model_info.embedding_dimension,
+            "context_length": model_info.context_length,
+        },
+    )
+    for model_id, model_info in EMBEDDING_MODEL_IDS.items()
 ]
diff --git a/llama_stack/providers/remote/inference/openai/openai.py b/llama_stack/providers/remote/inference/openai/openai.py
index 6b9c02e6c..9a1ec7ee0 100644
--- a/llama_stack/providers/remote/inference/openai/openai.py
+++ b/llama_stack/providers/remote/inference/openai/openai.py
@@ -4,12 +4,41 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+import logging
+from collections.abc import AsyncIterator
+from typing import Any
+
+from openai import AsyncOpenAI
+
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
 
 from .config import OpenAIConfig
 from .models import MODEL_ENTRIES
 
+logger = logging.getLogger(__name__)
 
+
+#
+# This OpenAI adapter implements Inference methods using two clients -
+#
+# | Inference Method           | Implementation Source    |
+# |----------------------------|--------------------------|
+# | completion                 | LiteLLMOpenAIMixin       |
+# | chat_completion            | LiteLLMOpenAIMixin       |
+# | embedding                  | LiteLLMOpenAIMixin       |
+# | batch_completion           | LiteLLMOpenAIMixin       |
+# | batch_chat_completion      | LiteLLMOpenAIMixin       |
+# | openai_completion          | AsyncOpenAI              |
+# | openai_chat_completion     | AsyncOpenAI              |
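+#
+# The openai_completion / openai_chat_completion methods assemble their keyword
+# arguments with prepare_openai_completion_params, which is expected to drop
+# None-valued arguments before the request is sent via the AsyncOpenAI client.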
+#
 class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
     def __init__(self, config: OpenAIConfig) -> None:
         LiteLLMOpenAIMixin.__init__(
@@ -19,9 +48,120 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
             provider_data_api_key_field="openai_api_key",
         )
         self.config = config
+        # we set is_openai_compat so users can use the canonical
+        # openai model names like "gpt-4" or "gpt-3.5-turbo"
+        # and the model name will be translated to litellm's
+        # "openai/gpt-4" or "openai/gpt-3.5-turbo" transparently.
+        # if we do not set this, users will be exposed to the
+        # litellm specific model names, an abstraction leak.
+        self.is_openai_compat = True
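+        # A plain AsyncOpenAI client is kept for the openai_completion /
+        # openai_chat_completion passthrough methods; all other inference
+        # methods go through the litellm-based mixin (see the table above).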
+        self._openai_client = AsyncOpenAI(
+            api_key=self.config.api_key,
+        )
 
     async def initialize(self) -> None:
         await super().initialize()
 
     async def shutdown(self) -> None:
         await super().shutdown()
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
+    ) -> OpenAICompletion:
+        if guided_choice is not None:
+            logging.warning("guided_choice is not supported by the OpenAI API. Ignoring.")
+        if prompt_logprobs is not None:
+            logging.warning("prompt_logprobs is not supported by the OpenAI API. Ignoring.")
+
+        params = await prepare_openai_completion_params(
+            model=(await self.model_store.get_model(model)).provider_resource_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+        )
+        return await self._openai_client.completions.create(**params)
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
+        params = await prepare_openai_completion_params(
+            model=(await self.model_store.get_model(model)).provider_resource_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+        return await self._openai_client.chat.completions.create(**params)
diff --git a/llama_stack/providers/remote/inference/sambanova/__init__.py b/llama_stack/providers/remote/inference/sambanova/__init__.py
index 3e682e69c..a3a7b8fbd 100644
--- a/llama_stack/providers/remote/inference/sambanova/__init__.py
+++ b/llama_stack/providers/remote/inference/sambanova/__init__.py
@@ -4,16 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from pydantic import BaseModel
+from llama_stack.apis.inference import Inference
 
 from .config import SambaNovaImplConfig
 
 
-class SambaNovaProviderDataValidator(BaseModel):
-    sambanova_api_key: str
-
-
-async def get_adapter_impl(config: SambaNovaImplConfig, _deps):
+async def get_adapter_impl(config: SambaNovaImplConfig, _deps) -> Inference:
     from .sambanova import SambaNovaInferenceAdapter
 
     assert isinstance(config, SambaNovaImplConfig), f"Unexpected config type: {type(config)}"
diff --git a/llama_stack/providers/remote/inference/sambanova/config.py b/llama_stack/providers/remote/inference/sambanova/config.py
index 8ca11de78..abbf9430f 100644
--- a/llama_stack/providers/remote/inference/sambanova/config.py
+++ b/llama_stack/providers/remote/inference/sambanova/config.py
@@ -6,25 +6,32 @@
 
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, SecretStr
 
 from llama_stack.schema_utils import json_schema_type
 
 
+class SambaNovaProviderDataValidator(BaseModel):
+    sambanova_api_key: str | None = Field(
+        default=None,
+        description="Sambanova Cloud API key",
+    )
+
+
 @json_schema_type
 class SambaNovaImplConfig(BaseModel):
     url: str = Field(
         default="https://api.sambanova.ai/v1",
         description="The URL for the SambaNova AI server",
     )
-    api_key: str | None = Field(
+    api_key: SecretStr | None = Field(
         default=None,
-        description="The SambaNova.ai API Key",
+        description="The SambaNova cloud API Key",
     )
 
     @classmethod
-    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY}", **kwargs) -> dict[str, Any]:
         return {
             "url": "https://api.sambanova.ai/v1",
-            "api_key": "${env.SAMBANOVA_API_KEY}",
+            "api_key": api_key,
         }
diff --git a/llama_stack/providers/remote/inference/sambanova/models.py b/llama_stack/providers/remote/inference/sambanova/models.py
index 43041e94a..9954fa7a0 100644
--- a/llama_stack/providers/remote/inference/sambanova/models.py
+++ b/llama_stack/providers/remote/inference/sambanova/models.py
@@ -11,43 +11,43 @@ from llama_stack.providers.utils.inference.model_registry import (
 
 MODEL_ENTRIES = [
     build_hf_repo_model_entry(
-        "Meta-Llama-3.1-8B-Instruct",
+        "sambanova/Meta-Llama-3.1-8B-Instruct",
         CoreModelId.llama3_1_8b_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "Meta-Llama-3.1-70B-Instruct",
-        CoreModelId.llama3_1_70b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "Meta-Llama-3.1-405B-Instruct",
+        "sambanova/Meta-Llama-3.1-405B-Instruct",
         CoreModelId.llama3_1_405b_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "Meta-Llama-3.2-1B-Instruct",
+        "sambanova/Meta-Llama-3.2-1B-Instruct",
         CoreModelId.llama3_2_1b_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "Meta-Llama-3.2-3B-Instruct",
+        "sambanova/Meta-Llama-3.2-3B-Instruct",
         CoreModelId.llama3_2_3b_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "Meta-Llama-3.3-70B-Instruct",
+        "sambanova/Meta-Llama-3.3-70B-Instruct",
         CoreModelId.llama3_3_70b_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "Llama-3.2-11B-Vision-Instruct",
+        "sambanova/Llama-3.2-11B-Vision-Instruct",
         CoreModelId.llama3_2_11b_vision_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "Llama-3.2-90B-Vision-Instruct",
+        "sambanova/Llama-3.2-90B-Vision-Instruct",
         CoreModelId.llama3_2_90b_vision_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "Meta-Llama-Guard-3-8B",
-        CoreModelId.llama_guard_3_8b.value,
-    ),
-    build_hf_repo_model_entry(
-        "Llama-4-Scout-17B-16E-Instruct",
+        "sambanova/Llama-4-Scout-17B-16E-Instruct",
         CoreModelId.llama4_scout_17b_16e_instruct.value,
     ),
+    build_hf_repo_model_entry(
+        "sambanova/Llama-4-Maverick-17B-128E-Instruct",
+        CoreModelId.llama4_maverick_17b_128e_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "sambanova/Meta-Llama-Guard-3-8B",
+        CoreModelId.llama_guard_3_8b.value,
+    ),
 ]
diff --git a/llama_stack/providers/remote/inference/sambanova/sambanova.py b/llama_stack/providers/remote/inference/sambanova/sambanova.py
index 3db95dcb4..d182aa1dc 100644
--- a/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py
@@ -5,305 +5,249 @@
 # the root directory of this source tree.
 
 import json
-from collections.abc import AsyncGenerator
+from collections.abc import Iterable
 
-from openai import OpenAI
+from openai.types.chat import (
+    ChatCompletionAssistantMessageParam as OpenAIChatCompletionAssistantMessage,
+)
+from openai.types.chat import (
+    ChatCompletionContentPartImageParam as OpenAIChatCompletionContentPartImageParam,
+)
+from openai.types.chat import (
+    ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam,
+)
+from openai.types.chat import (
+    ChatCompletionContentPartTextParam as OpenAIChatCompletionContentPartTextParam,
+)
+from openai.types.chat import (
+    ChatCompletionMessageParam as OpenAIChatCompletionMessage,
+)
+from openai.types.chat import (
+    ChatCompletionMessageToolCallParam as OpenAIChatCompletionMessageToolCall,
+)
+from openai.types.chat import (
+    ChatCompletionSystemMessageParam as OpenAIChatCompletionSystemMessage,
+)
+from openai.types.chat import (
+    ChatCompletionToolMessageParam as OpenAIChatCompletionToolMessage,
+)
+from openai.types.chat import (
+    ChatCompletionUserMessageParam as OpenAIChatCompletionUserMessage,
+)
+from openai.types.chat.chat_completion_content_part_image_param import (
+    ImageURL as OpenAIImageURL,
+)
+from openai.types.chat.chat_completion_message_tool_call_param import (
+    Function as OpenAIFunction,
+)
 
 from llama_stack.apis.common.content_types import (
     ImageContentItem,
     InterleavedContent,
-    InterleavedContentItem,
     TextContentItem,
 )
 from llama_stack.apis.inference import (
     ChatCompletionRequest,
-    ChatCompletionResponse,
     CompletionMessage,
-    EmbeddingsResponse,
-    EmbeddingTaskType,
-    GreedySamplingStrategy,
-    Inference,
-    LogProbConfig,
+    JsonSchemaResponseFormat,
     Message,
-    ResponseFormat,
-    SamplingParams,
-    StopReason,
     SystemMessage,
-    TextTruncation,
-    ToolCall,
     ToolChoice,
-    ToolConfig,
-    ToolDefinition,
-    ToolPromptFormat,
     ToolResponseMessage,
-    TopKSamplingStrategy,
-    TopPSamplingStrategy,
     UserMessage,
 )
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+from llama_stack.log import get_logger
+from llama_stack.models.llama.datatypes import BuiltinTool
+from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
 from llama_stack.providers.utils.inference.openai_compat import (
-    OpenAIChatCompletionToLlamaStackMixin,
-    OpenAICompletionToLlamaStackMixin,
-    process_chat_completion_stream_response,
-)
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    convert_image_content_to_url,
+    convert_tooldef_to_openai_tool,
+    get_sampling_options,
 )
+from llama_stack.providers.utils.inference.prompt_adapter import convert_image_content_to_url
 
 from .config import SambaNovaImplConfig
 from .models import MODEL_ENTRIES
 
+logger = get_logger(name=__name__, category="inference")
 
-class SambaNovaInferenceAdapter(
-    ModelRegistryHelper,
-    Inference,
-    OpenAIChatCompletionToLlamaStackMixin,
-    OpenAICompletionToLlamaStackMixin,
-):
-    def __init__(self, config: SambaNovaImplConfig) -> None:
-        ModelRegistryHelper.__init__(self, model_entries=MODEL_ENTRIES)
-        self.config = config
 
-    async def initialize(self) -> None:
-        return
+async def convert_message_to_openai_dict_with_b64_images(
+    message: Message | dict,
+) -> OpenAIChatCompletionMessage:
+    """
+    Convert a Message to an OpenAI API-compatible dictionary, inlining any image content as base64 data URLs.
+    """
+    # Users can supply a dict instead of a Message object; convert it to the
+    # appropriate Message type first so the rest of the conversion is type-safe.
+    if isinstance(message, dict):
+        if "role" not in message:
+            raise ValueError("role is required in message")
+        if message["role"] == "user":
+            message = UserMessage(**message)
+        elif message["role"] == "assistant":
+            message = CompletionMessage(**message)
+        elif message["role"] == "tool":
+            message = ToolResponseMessage(**message)
+        elif message["role"] == "system":
+            message = SystemMessage(**message)
+        else:
+            raise ValueError(f"Unsupported message role: {message['role']}")
 
-    async def shutdown(self) -> None:
-        pass
-
-    def _get_client(self) -> OpenAI:
-        return OpenAI(base_url=self.config.url, api_key=self.config.api_key)
-
-    async def completion(
-        self,
-        model_id: str,
+    # Map Llama Stack spec to OpenAI spec -
+    #  str -> str
+    #  {"type": "text", "text": ...} -> {"type": "text", "text": ...}
+    #  {"type": "image", "image": {"url": {"uri": ...}}} -> {"type": "image_url", "image_url": {"url": ...}}
+    #  {"type": "image", "image": {"data": ...}} -> {"type": "image_url", "image_url": {"url": "data:image/?;base64,..."}}
+    #  List[...] -> List[...]
+    async def _convert_message_content(
         content: InterleavedContent,
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        stream: bool | None = False,
-        logprobs: LogProbConfig | None = None,
-    ) -> AsyncGenerator:
-        raise NotImplementedError()
-
-    async def chat_completion(
-        self,
-        model_id: str,
-        messages: list[Message],
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        tools: list[ToolDefinition] | None = None,
-        tool_choice: ToolChoice | None = ToolChoice.auto,
-        tool_prompt_format: ToolPromptFormat | None = ToolPromptFormat.json,
-        stream: bool | None = False,
-        tool_config: ToolConfig | None = None,
-        logprobs: LogProbConfig | None = None,
-    ) -> AsyncGenerator:
-        if sampling_params is None:
-            sampling_params = SamplingParams()
-        model = await self.model_store.get_model(model_id)
-
-        request = ChatCompletionRequest(
-            model=model.provider_resource_id,
-            messages=messages,
-            sampling_params=sampling_params,
-            tools=tools or [],
-            stream=stream,
-            logprobs=logprobs,
-            tool_config=tool_config,
-        )
-        request_sambanova = await self.convert_chat_completion_request(request)
-
-        if stream:
-            return self._stream_chat_completion(request_sambanova)
-        else:
-            return await self._nonstream_chat_completion(request_sambanova)
-
-    async def _nonstream_chat_completion(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
-        response = self._get_client().chat.completions.create(**request)
-
-        choice = response.choices[0]
-
-        result = ChatCompletionResponse(
-            completion_message=CompletionMessage(
-                content=choice.message.content or "",
-                stop_reason=self.convert_to_sambanova_finish_reason(choice.finish_reason),
-                tool_calls=self.convert_to_sambanova_tool_calls(choice.message.tool_calls),
-            ),
-            logprobs=None,
-        )
-
-        return result
-
-    async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
-        async def _to_async_generator():
-            streaming = self._get_client().chat.completions.create(**request)
-            for chunk in streaming:
-                yield chunk
-
-        stream = _to_async_generator()
-        async for chunk in process_chat_completion_stream_response(stream, request):
-            yield chunk
-
-    async def embeddings(
-        self,
-        model_id: str,
-        contents: list[str] | list[InterleavedContentItem],
-        text_truncation: TextTruncation | None = TextTruncation.none,
-        output_dimension: int | None = None,
-        task_type: EmbeddingTaskType | None = None,
-    ) -> EmbeddingsResponse:
-        raise NotImplementedError()
-
-    async def convert_chat_completion_request(self, request: ChatCompletionRequest) -> dict:
-        compatible_request = self.convert_sampling_params(request.sampling_params)
-        compatible_request["model"] = request.model
-        compatible_request["messages"] = await self.convert_to_sambanova_messages(request.messages)
-        compatible_request["stream"] = request.stream
-        compatible_request["logprobs"] = False
-        compatible_request["extra_headers"] = {
-            b"User-Agent": b"llama-stack: sambanova-inference-adapter",
-        }
-        compatible_request["tools"] = self.convert_to_sambanova_tool(request.tools)
-        return compatible_request
-
-    def convert_sampling_params(self, sampling_params: SamplingParams, legacy: bool = False) -> dict:
-        params = {}
-
-        if sampling_params:
-            params["frequency_penalty"] = sampling_params.repetition_penalty
-
-            if sampling_params.max_tokens:
-                if legacy:
-                    params["max_tokens"] = sampling_params.max_tokens
-                else:
-                    params["max_completion_tokens"] = sampling_params.max_tokens
-
-            if isinstance(sampling_params.strategy, TopPSamplingStrategy):
-                params["top_p"] = sampling_params.strategy.top_p
-            if isinstance(sampling_params.strategy, TopKSamplingStrategy):
-                params["extra_body"]["top_k"] = sampling_params.strategy.top_k
-            if isinstance(sampling_params.strategy, GreedySamplingStrategy):
-                params["temperature"] = 0.0
-
-        return params
-
-    async def convert_to_sambanova_messages(self, messages: list[Message]) -> list[dict]:
-        conversation = []
-        for message in messages:
-            content = {}
-
-            content["content"] = await self.convert_to_sambanova_content(message)
-
-            if isinstance(message, UserMessage):
-                content["role"] = "user"
-            elif isinstance(message, CompletionMessage):
-                content["role"] = "assistant"
-                tools = []
-                for tool_call in message.tool_calls:
-                    tools.append(
-                        {
-                            "id": tool_call.call_id,
-                            "function": {
-                                "name": tool_call.name,
-                                "arguments": json.dumps(tool_call.arguments),
-                            },
-                            "type": "function",
-                        }
-                    )
-                content["tool_calls"] = tools
-            elif isinstance(message, ToolResponseMessage):
-                content["role"] = "tool"
-                content["tool_call_id"] = message.call_id
-            elif isinstance(message, SystemMessage):
-                content["role"] = "system"
-
-            conversation.append(content)
-
-        return conversation
-
-    async def convert_to_sambanova_content(self, message: Message) -> dict:
-        async def _convert_content(content) -> dict:
-            if isinstance(content, ImageContentItem):
-                url = await convert_image_content_to_url(content, download=True)
-                # A fix to make sure the call sucess.
-                components = url.split(";base64")
-                url = f"{components[0].lower()};base64{components[1]}"
-                return {
-                    "type": "image_url",
-                    "image_url": {"url": url},
-                }
+    ) -> str | Iterable[OpenAIChatCompletionContentPartParam]:
+        async def impl(
+            content_: InterleavedContent,
+        ) -> str | OpenAIChatCompletionContentPartParam | list[OpenAIChatCompletionContentPartParam]:
+            # Llama Stack and OpenAI spec match for str and text input
+            if isinstance(content_, str):
+                return content_
+            elif isinstance(content_, TextContentItem):
+                return OpenAIChatCompletionContentPartTextParam(
+                    type="text",
+                    text=content_.text,
+                )
+            elif isinstance(content_, ImageContentItem):
+                return OpenAIChatCompletionContentPartImageParam(
+                    type="image_url",
+                    image_url=OpenAIImageURL(url=await convert_image_content_to_url(content_, download=True)),
+                )
+            elif isinstance(content_, list):
+                return [await impl(item) for item in content_]
             else:
-                text = content.text if isinstance(content, TextContentItem) else content
-                assert isinstance(text, str)
-                return {"type": "text", "text": text}
+                raise ValueError(f"Unsupported content type: {type(content_)}")
 
-        if isinstance(message.content, list):
-            # If it is a list, the text content should be wrapped in dict
-            content = [await _convert_content(c) for c in message.content]
+        ret = await impl(content)
+
+        # OpenAI*Message expects a str or list
+        if isinstance(ret, str) or isinstance(ret, list):
+            return ret
         else:
-            content = message.content
+            return [ret]
 
-        return content
+    out: OpenAIChatCompletionMessage  # every branch below either assigns out or raises
+    if isinstance(message, UserMessage):
+        out = OpenAIChatCompletionUserMessage(
+            role="user",
+            content=await _convert_message_content(message.content),
+        )
+    elif isinstance(message, CompletionMessage):
+        out = OpenAIChatCompletionAssistantMessage(
+            role="assistant",
+            content=await _convert_message_content(message.content),
+            tool_calls=[
+                OpenAIChatCompletionMessageToolCall(
+                    id=tool.call_id,
+                    function=OpenAIFunction(
+                        name=tool.tool_name if not isinstance(tool.tool_name, BuiltinTool) else tool.tool_name.value,
+                        arguments=json.dumps(tool.arguments),
+                    ),
+                    type="function",
+                )
+                for tool in message.tool_calls
+            ]
+            or None,
+        )
+    elif isinstance(message, ToolResponseMessage):
+        out = OpenAIChatCompletionToolMessage(
+            role="tool",
+            tool_call_id=message.call_id,
+            content=await _convert_message_content(message.content),
+        )
+    elif isinstance(message, SystemMessage):
+        out = OpenAIChatCompletionSystemMessage(
+            role="system",
+            content=await _convert_message_content(message.content),
+        )
+    else:
+        raise ValueError(f"Unsupported message type: {type(message)}")
 
-    def convert_to_sambanova_tool(self, tools: list[ToolDefinition]) -> list[dict]:
-        if tools is None:
-            return tools
+    return out
 
-        compatiable_tools = []
 
-        for tool in tools:
-            properties = {}
-            compatiable_required = []
-            if tool.parameters:
-                for tool_key, tool_param in tool.parameters.items():
-                    properties[tool_key] = {"type": tool_param.param_type}
-                    if tool_param.description:
-                        properties[tool_key]["description"] = tool_param.description
-                    if tool_param.default:
-                        properties[tool_key]["default"] = tool_param.default
-                    if tool_param.required:
-                        compatiable_required.append(tool_key)
+class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
+    _config: SambaNovaImplConfig
 
-            compatiable_tool = {
-                "type": "function",
-                "function": {
-                    "name": tool.tool_name,
-                    "description": tool.description,
-                    "parameters": {
-                        "type": "object",
-                        "properties": properties,
-                        "required": compatiable_required,
-                    },
+    def __init__(self, config: SambaNovaImplConfig):
+        self.config = config
+        LiteLLMOpenAIMixin.__init__(
+            self,
+            model_entries=MODEL_ENTRIES,
+            api_key_from_config=self.config.api_key,
+            provider_data_api_key_field="sambanova_api_key",
+        )
+
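+    # The API key can come either from the static provider config or, per
+    # request, from the "sambanova_api_key" field of the
+    # X-LlamaStack-Provider-Data header.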
+    def _get_api_key(self) -> str:
+        config_api_key = self.config.api_key if self.config.api_key else None
+        if config_api_key:
+            return config_api_key.get_secret_value()
+        else:
+            provider_data = self.get_request_provider_data()
+            if provider_data is None or not provider_data.sambanova_api_key:
+                raise ValueError(
+                    'Pass Sambanova API Key in the header X-LlamaStack-Provider-Data as { "sambanova_api_key": <your api key> }'
+                )
+            return provider_data.sambanova_api_key
+
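+    # _get_params builds the request kwargs for the litellm-based chat
+    # completion path: OpenAI-format messages (with images inlined as base64
+    # data URLs), optional response_format and tools, plus api_key, api_base
+    # and the sampling options derived from the request.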
+    async def _get_params(self, request: ChatCompletionRequest) -> dict:
+        input_dict = {}
+
+        input_dict["messages"] = [await convert_message_to_openai_dict_with_b64_images(m) for m in request.messages]
+        if fmt := request.response_format:
+            if not isinstance(fmt, JsonSchemaResponseFormat):
+                raise ValueError(
+                    f"Unsupported response format: {type(fmt)}. Only JsonSchemaResponseFormat is supported."
+                )
+
+            fmt = fmt.json_schema
+            name = fmt["title"]
+            del fmt["title"]
+            fmt["additionalProperties"] = False
+
+            # Apply additionalProperties: False recursively to all objects
+            fmt = self._add_additional_properties_recursive(fmt)
+
+            input_dict["response_format"] = {
+                "type": "json_schema",
+                "json_schema": {
+                    "name": name,
+                    "schema": fmt,
+                    "strict": True,
                 },
             }
+        if request.tools:
+            input_dict["tools"] = [convert_tooldef_to_openai_tool(tool) for tool in request.tools]
+            if request.tool_config.tool_choice:
+                input_dict["tool_choice"] = (
+                    request.tool_config.tool_choice.value
+                    if isinstance(request.tool_config.tool_choice, ToolChoice)
+                    else request.tool_config.tool_choice
+                )
 
-            compatiable_tools.append(compatiable_tool)
+        provider_data = self.get_request_provider_data()
+        key_field = self.provider_data_api_key_field
+        if provider_data and getattr(provider_data, key_field, None):
+            api_key = getattr(provider_data, key_field)
+        else:
+            api_key = self._get_api_key()
 
-        if len(compatiable_tools) > 0:
-            return compatiable_tools
-        return None
-
-    def convert_to_sambanova_finish_reason(self, finish_reason: str) -> StopReason:
         return {
-            "stop": StopReason.end_of_turn,
-            "length": StopReason.out_of_tokens,
-            "tool_calls": StopReason.end_of_message,
-        }.get(finish_reason, StopReason.end_of_turn)
+            "model": request.model,
+            "api_key": api_key,
+            "api_base": self.config.url,
+            **input_dict,
+            "stream": request.stream,
+            **get_sampling_options(request.sampling_params),
+        }
 
-    def convert_to_sambanova_tool_calls(
-        self,
-        tool_calls,
-    ) -> list[ToolCall]:
-        if not tool_calls:
-            return []
+    async def initialize(self):
+        await super().initialize()
 
-        compitable_tool_calls = [
-            ToolCall(
-                call_id=call.id,
-                tool_name=call.function.name,
-                arguments=json.loads(call.function.arguments),
-                arguments_json=call.function.arguments,
-            )
-            for call in tool_calls
-        ]
-
-        return compitable_tool_calls
+    async def shutdown(self):
+        await super().shutdown()
diff --git a/llama_stack/providers/remote/inference/sambanova_openai_compat/__init__.py b/llama_stack/providers/remote/inference/sambanova_openai_compat/__init__.py
index e31a3364c..60afe91ca 100644
--- a/llama_stack/providers/remote/inference/sambanova_openai_compat/__init__.py
+++ b/llama_stack/providers/remote/inference/sambanova_openai_compat/__init__.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider
 
 from .config import SambaNovaCompatConfig
 
 
-async def get_adapter_impl(config: SambaNovaCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: SambaNovaCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .sambanova import SambaNovaCompatInferenceAdapter
 
diff --git a/llama_stack/providers/remote/inference/together_openai_compat/__init__.py b/llama_stack/providers/remote/inference/together_openai_compat/__init__.py
index 6fdf05b7e..8213fc5f4 100644
--- a/llama_stack/providers/remote/inference/together_openai_compat/__init__.py
+++ b/llama_stack/providers/remote/inference/together_openai_compat/__init__.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider
 
 from .config import TogetherCompatConfig
 
 
-async def get_adapter_impl(config: TogetherCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: TogetherCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .together import TogetherCompatInferenceAdapter
 
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index addf2d35b..d00218dd5 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -158,27 +158,28 @@ def _convert_to_vllm_finish_reason(finish_reason: str) -> StopReason:
     }.get(finish_reason, StopReason.end_of_turn)
 
 
-async def _process_vllm_chat_completion_stream_response(
-    stream: AsyncGenerator[OpenAIChatCompletionChunk, None],
-) -> AsyncGenerator:
-    event_type = ChatCompletionResponseEventType.start
-    tool_call_buf = UnparseableToolCall()
-    async for chunk in stream:
-        if not chunk.choices:
-            log.warning("vLLM failed to generation any completions - check the vLLM server logs for an error.")
-            continue
-        choice = chunk.choices[0]
-        if choice.finish_reason:
-            args_str = tool_call_buf.arguments
-            args = None
-            try:
-                args = {} if not args_str else json.loads(args_str)
-            except Exception as e:
-                log.warning(f"Failed to parse tool call buffer arguments: {args_str} \nError: {e}")
-            if args:
-                yield ChatCompletionResponseStreamChunk(
+def _process_vllm_chat_completion_end_of_stream(
+    finish_reason: str | None,
+    last_chunk_content: str | None,
+    current_event_type: ChatCompletionResponseEventType,
+    tool_call_bufs: dict[str, UnparseableToolCall] | None = None,
+) -> list[OpenAIChatCompletionChunk]:
+    chunks = []
+
+    if finish_reason is not None:
+        stop_reason = _convert_to_vllm_finish_reason(finish_reason)
+    else:
+        stop_reason = StopReason.end_of_message
+
+    tool_call_bufs = tool_call_bufs or {}
+    for _index, tool_call_buf in sorted(tool_call_bufs.items()):
+        args_str = tool_call_buf.arguments or "{}"
+        try:
+            args = json.loads(args_str)
+            chunks.append(
+                ChatCompletionResponseStreamChunk(
                     event=ChatCompletionResponseEvent(
-                        event_type=event_type,
+                        event_type=current_event_type,
                         delta=ToolCallDelta(
                             tool_call=ToolCall(
                                 call_id=tool_call_buf.call_id,
@@ -190,8 +191,12 @@ async def _process_vllm_chat_completion_stream_response(
                         ),
                     )
                 )
-            elif args_str:
-                yield ChatCompletionResponseStreamChunk(
+            )
+        except Exception as e:
+            log.warning(f"Failed to parse tool call buffer arguments: {args_str} \nError: {e}")
+
+            chunks.append(
+                ChatCompletionResponseStreamChunk(
                     event=ChatCompletionResponseEvent(
                         event_type=ChatCompletionResponseEventType.progress,
                         delta=ToolCallDelta(
@@ -200,21 +205,62 @@ async def _process_vllm_chat_completion_stream_response(
                         ),
                     )
                 )
-            yield ChatCompletionResponseStreamChunk(
-                event=ChatCompletionResponseEvent(
-                    event_type=ChatCompletionResponseEventType.complete,
-                    delta=TextDelta(text=choice.delta.content or ""),
-                    logprobs=None,
-                    stop_reason=_convert_to_vllm_finish_reason(choice.finish_reason),
-                )
             )
-        elif choice.delta.tool_calls:
-            tool_call = convert_tool_call(choice.delta.tool_calls[0])
-            tool_call_buf.tool_name += str(tool_call.tool_name)
-            tool_call_buf.call_id += tool_call.call_id
-            # TODO: remove str() when dict type for 'arguments' is no longer allowed
-            tool_call_buf.arguments += str(tool_call.arguments)
-        else:
+
+    chunks.append(
+        ChatCompletionResponseStreamChunk(
+            event=ChatCompletionResponseEvent(
+                event_type=ChatCompletionResponseEventType.complete,
+                delta=TextDelta(text=last_chunk_content or ""),
+                logprobs=None,
+                stop_reason=stop_reason,
+            )
+        )
+    )
+
+    return chunks
+
+
+async def _process_vllm_chat_completion_stream_response(
+    stream: AsyncGenerator[OpenAIChatCompletionChunk, None],
+) -> AsyncGenerator:
+    yield ChatCompletionResponseStreamChunk(
+        event=ChatCompletionResponseEvent(
+            event_type=ChatCompletionResponseEventType.start,
+            delta=TextDelta(text=""),
+        )
+    )
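+    # Everything after the initial "start" chunk is emitted as a progress
+    # event; the terminal "complete" chunk is produced by
+    # _process_vllm_chat_completion_end_of_stream.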
+    event_type = ChatCompletionResponseEventType.progress
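+    # Tool call deltas are buffered per tool-call index so that parallel tool
+    # calls streamed by vLLM are reassembled independently before parsing.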
+    tool_call_bufs: dict[str, UnparseableToolCall] = {}
+    end_of_stream_processed = False
+
+    async for chunk in stream:
+        if not chunk.choices:
+            log.warning("vLLM failed to generation any completions - check the vLLM server logs for an error.")
+            return
+        choice = chunk.choices[0]
+        if choice.delta.tool_calls:
+            for delta_tool_call in choice.delta.tool_calls:
+                tool_call = convert_tool_call(delta_tool_call)
+                if delta_tool_call.index not in tool_call_bufs:
+                    tool_call_bufs[delta_tool_call.index] = UnparseableToolCall()
+                tool_call_buf = tool_call_bufs[delta_tool_call.index]
+                tool_call_buf.tool_name += str(tool_call.tool_name)
+                tool_call_buf.call_id += tool_call.call_id
+                tool_call_buf.arguments += (
+                    tool_call.arguments if isinstance(tool_call.arguments, str) else json.dumps(tool_call.arguments)
+                )
+        if choice.finish_reason:
+            chunks = _process_vllm_chat_completion_end_of_stream(
+                finish_reason=choice.finish_reason,
+                last_chunk_content=choice.delta.content,
+                current_event_type=event_type,
+                tool_call_bufs=tool_call_bufs,
+            )
+            for c in chunks:
+                yield c
+            end_of_stream_processed = True
+        elif not choice.delta.tool_calls:
             yield ChatCompletionResponseStreamChunk(
                 event=ChatCompletionResponseEvent(
                     event_type=event_type,
@@ -224,6 +270,17 @@ async def _process_vllm_chat_completion_stream_response(
             )
             event_type = ChatCompletionResponseEventType.progress
 
+    if end_of_stream_processed:
+        return
+
+    # the stream ended without a chunk containing finish_reason - we have to generate the
+    # respective completion chunks manually
+    chunks = _process_vllm_chat_completion_end_of_stream(
+        finish_reason=None, last_chunk_content=None, current_event_type=event_type, tool_call_bufs=tool_call_bufs
+    )
+    for c in chunks:
+        yield c
+
 
 class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
     def __init__(self, config: VLLMInferenceAdapterConfig) -> None:
@@ -272,6 +329,8 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         if sampling_params is None:
             sampling_params = SamplingParams()
         model = await self._get_model(model_id)
+        if model.provider_resource_id is None:
+            raise ValueError(f"Model {model_id} has no provider_resource_id set")
         request = CompletionRequest(
             model=model.provider_resource_id,
             content=content,
@@ -302,6 +361,8 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         if sampling_params is None:
             sampling_params = SamplingParams()
         model = await self._get_model(model_id)
+        if model.provider_resource_id is None:
+            raise ValueError(f"Model {model_id} has no provider_resource_id set")
         # This is to be consistent with OpenAI API and support vLLM <= v0.6.3
         # References:
         #   * https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice
diff --git a/llama_stack/providers/remote/vector_io/chroma/chroma.py b/llama_stack/providers/remote/vector_io/chroma/chroma.py
index 5381a48ef..a919963ab 100644
--- a/llama_stack/providers/remote/vector_io/chroma/chroma.py
+++ b/llama_stack/providers/remote/vector_io/chroma/chroma.py
@@ -26,8 +26,7 @@ from .config import ChromaVectorIOConfig as RemoteChromaVectorIOConfig
 
 log = logging.getLogger(__name__)
 
-
-ChromaClientType = chromadb.AsyncHttpClient | chromadb.PersistentClient
+ChromaClientType = chromadb.api.AsyncClientAPI | chromadb.api.ClientAPI
 
 
 # this is a helper to allow us to use async and non-async chroma clients interchangeably
diff --git a/llama_stack/providers/tests/ci_test_config.yaml b/llama_stack/providers/tests/ci_test_config.yaml
deleted file mode 100644
index 3edcd38bf..000000000
--- a/llama_stack/providers/tests/ci_test_config.yaml
+++ /dev/null
@@ -1,55 +0,0 @@
-inference:
-  tests:
-  - inference/test_vision_inference.py::test_vision_chat_completion_streaming
-  - inference/test_vision_inference.py::test_vision_chat_completion_non_streaming
-  - inference/test_text_inference.py::test_structured_output
-  - inference/test_text_inference.py::test_chat_completion_streaming
-  - inference/test_text_inference.py::test_chat_completion_non_streaming
-  - inference/test_text_inference.py::test_chat_completion_with_tool_calling
-  - inference/test_text_inference.py::test_chat_completion_with_tool_calling_streaming
-
-  scenarios:
-  - provider_fixtures:
-      inference: ollama
-  - fixture_combo_id: fireworks
-  - provider_fixtures:
-      inference: together
-    # - inference: tgi
-    # - inference: vllm_remote
-
-  inference_models:
-  - meta-llama/Llama-3.1-8B-Instruct
-  - meta-llama/Llama-3.2-11B-Vision-Instruct
-
-
-agents:
-  tests:
-   - agents/test_agents.py::test_agent_turns_with_safety
-   - agents/test_agents.py::test_rag_agent
-
-  scenarios:
-  - fixture_combo_id: ollama
-  - fixture_combo_id: together
-  - fixture_combo_id: fireworks
-
-  inference_models:
-  - meta-llama/Llama-3.2-1B-Instruct
-
-  safety_shield: meta-llama/Llama-Guard-3-1B
-
-
-memory:
-  tests:
-   - memory/test_memory.py::test_query_documents
-
-  scenarios:
-  - fixture_combo_id: ollama
-  - provider_fixtures:
-      inference: sentence_transformers
-      memory: faiss
-  - fixture_combo_id: chroma
-
-  inference_models:
-  - meta-llama/Llama-3.2-1B-Instruct
-
-  embedding_model: all-MiniLM-L6-v2
diff --git a/llama_stack/providers/tests/conftest.py b/llama_stack/providers/tests/conftest.py
deleted file mode 100644
index cd86af0d6..000000000
--- a/llama_stack/providers/tests/conftest.py
+++ /dev/null
@@ -1,296 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-from collections import defaultdict
-from pathlib import Path
-from typing import Any
-
-import pytest
-import yaml
-from dotenv import load_dotenv
-from pydantic import BaseModel, Field
-from termcolor import colored
-
-from llama_stack.distribution.datatypes import Provider
-from llama_stack.providers.datatypes import RemoteProviderConfig
-
-from .env import get_env_or_fail
-from .report import Report
-
-
-class ProviderFixture(BaseModel):
-    providers: list[Provider]
-    provider_data: dict[str, Any] | None = None
-
-
-class TestScenario(BaseModel):
-    # provider fixtures can be either a mark or a dictionary of api -> providers
-    provider_fixtures: dict[str, str] = Field(default_factory=dict)
-    fixture_combo_id: str | None = None
-
-
-class APITestConfig(BaseModel):
-    scenarios: list[TestScenario] = Field(default_factory=list)
-    inference_models: list[str] = Field(default_factory=list)
-
-    # test name format should be ::
-    tests: list[str] = Field(default_factory=list)
-
-
-class MemoryApiTestConfig(APITestConfig):
-    embedding_model: str | None = Field(default_factory=None)
-
-
-class AgentsApiTestConfig(APITestConfig):
-    safety_shield: str | None = Field(default_factory=None)
-
-
-class TestConfig(BaseModel):
-    inference: APITestConfig | None = None
-    agents: AgentsApiTestConfig | None = None
-    memory: MemoryApiTestConfig | None = None
-
-
-def get_test_config_from_config_file(metafunc_config):
-    config_file = metafunc_config.getoption("--config")
-    if config_file is None:
-        return None
-
-    config_file_path = Path(__file__).parent / config_file
-    if not config_file_path.exists():
-        raise ValueError(
-            f"Test config {config_file} was specified but not found. Please make sure it exists in the llama_stack/providers/tests directory."
-        )
-    with open(config_file_path) as config_file:
-        config = yaml.safe_load(config_file)
-        return TestConfig(**config)
-
-
-def get_test_config_for_api(metafunc_config, api):
-    test_config = get_test_config_from_config_file(metafunc_config)
-    if test_config is None:
-        return None
-    return getattr(test_config, api)
-
-
-def get_provider_fixture_overrides_from_test_config(metafunc_config, api, default_provider_fixture_combinations):
-    api_config = get_test_config_for_api(metafunc_config, api)
-    if api_config is None:
-        return None
-
-    fixture_combo_ids = set()
-    custom_provider_fixture_combos = []
-    for scenario in api_config.scenarios:
-        if scenario.fixture_combo_id:
-            fixture_combo_ids.add(scenario.fixture_combo_id)
-        else:
-            custom_provider_fixture_combos.append(
-                pytest.param(
-                    scenario.provider_fixtures,
-                    id=scenario.provider_fixtures.get("inference") or "",
-                )
-            )
-
-    if len(fixture_combo_ids) > 0:
-        for default_fixture in default_provider_fixture_combinations:
-            if default_fixture.id in fixture_combo_ids:
-                custom_provider_fixture_combos.append(default_fixture)
-    return custom_provider_fixture_combos
-
-
-def remote_stack_fixture() -> ProviderFixture:
-    if url := os.getenv("REMOTE_STACK_URL", None):
-        config = RemoteProviderConfig.from_url(url)
-    else:
-        config = RemoteProviderConfig(
-            host=get_env_or_fail("REMOTE_STACK_HOST"),
-            port=int(get_env_or_fail("REMOTE_STACK_PORT")),
-        )
-    return ProviderFixture(
-        providers=[
-            Provider(
-                provider_id="test::remote",
-                provider_type="test::remote",
-                config=config.model_dump(),
-            )
-        ],
-    )
-
-
-def pytest_configure(config):
-    config.option.tbstyle = "short"
-    config.option.disable_warnings = True
-
-    """Load environment variables at start of test run"""
-    # Load from .env file if it exists
-    env_file = Path(__file__).parent / ".env"
-    if env_file.exists():
-        load_dotenv(env_file)
-
-    # Load any environment variables passed via --env
-    env_vars = config.getoption("--env") or []
-    for env_var in env_vars:
-        key, value = env_var.split("=", 1)
-        os.environ[key] = value
-
-    if config.getoption("--output") is not None:
-        config.pluginmanager.register(Report(config.getoption("--output")))
-
-
-def pytest_addoption(parser):
-    parser.addoption(
-        "--providers",
-        default="",
-        help=(
-            "Provider configuration in format: api1=provider1,api2=provider2. "
-            "Example: --providers inference=ollama,safety=meta-reference"
-        ),
-    )
-    parser.addoption(
-        "--config",
-        action="store",
-        help="Set test config file (supported format: YAML), e.g. --config=test_config.yml",
-    )
-    parser.addoption(
-        "--output",
-        action="store",
-        help="Set output file for test report, e.g. --output=pytest_report.md",
-    )
-    """Add custom command line options"""
-    parser.addoption("--env", action="append", help="Set environment variables, e.g. --env KEY=value")
-    parser.addoption(
-        "--inference-model",
-        action="store",
-        default="meta-llama/Llama-3.2-3B-Instruct",
-        help="Specify the inference model to use for testing",
-    )
-    parser.addoption(
-        "--safety-shield",
-        action="store",
-        default="meta-llama/Llama-Guard-3-1B",
-        help="Specify the safety shield to use for testing",
-    )
-    parser.addoption(
-        "--embedding-model",
-        action="store",
-        default=None,
-        help="Specify the embedding model to use for testing",
-    )
-    parser.addoption(
-        "--judge-model",
-        action="store",
-        default="meta-llama/Llama-3.1-8B-Instruct",
-        help="Specify the judge model to use for testing",
-    )
-
-
-def make_provider_id(providers: dict[str, str]) -> str:
-    return ":".join(f"{api}={provider}" for api, provider in sorted(providers.items()))
-
-
-def get_provider_marks(providers: dict[str, str]) -> list[Any]:
-    marks = []
-    for provider in providers.values():
-        marks.append(getattr(pytest.mark, provider))
-    return marks
-
-
-def get_provider_fixture_overrides(config, available_fixtures: dict[str, list[str]]) -> list[pytest.param] | None:
-    provider_str = config.getoption("--providers")
-    if not provider_str:
-        return None
-
-    fixture_dict = parse_fixture_string(provider_str, available_fixtures)
-    return [
-        pytest.param(
-            fixture_dict,
-            id=make_provider_id(fixture_dict),
-            marks=get_provider_marks(fixture_dict),
-        )
-    ]
-
-
-def parse_fixture_string(provider_str: str, available_fixtures: dict[str, list[str]]) -> dict[str, str]:
-    """Parse provider string of format 'api1=provider1,api2=provider2'"""
-    if not provider_str:
-        return {}
-
-    fixtures = {}
-    pairs = provider_str.split(",")
-    for pair in pairs:
-        if "=" not in pair:
-            raise ValueError(f"Invalid provider specification: {pair}. Expected format: api=provider")
-        api, fixture = pair.split("=")
-        if api not in available_fixtures:
-            raise ValueError(f"Unknown API: {api}. Available APIs: {list(available_fixtures.keys())}")
-        if fixture not in available_fixtures[api]:
-            raise ValueError(
-                f"Unknown provider '{fixture}' for API '{api}'. Available providers: {list(available_fixtures[api])}"
-            )
-        fixtures[api] = fixture
-
-    # Check that all provided APIs are supported
-    for api in available_fixtures.keys():
-        if api not in fixtures:
-            raise ValueError(
-                f"Missing provider fixture for API '{api}'. Available providers: {list(available_fixtures[api])}"
-            )
-    return fixtures
-
-
-def pytest_itemcollected(item):
-    # Get all markers as a list
-    filtered = ("asyncio", "parametrize")
-    marks = [mark.name for mark in item.iter_markers() if mark.name not in filtered]
-    if marks:
-        marks = colored(",".join(marks), "yellow")
-        item.name = f"{item.name}[{marks}]"
-
-
-def pytest_collection_modifyitems(session, config, items):
-    test_config = get_test_config_from_config_file(config)
-    if test_config is None:
-        return
-
-    required_tests = defaultdict(set)
-    for api_test_config in [
-        test_config.inference,
-        test_config.memory,
-        test_config.agents,
-    ]:
-        if api_test_config is None:
-            continue
-        for test in api_test_config.tests:
-            arr = test.split("::")
-            if len(arr) != 2:
-                raise ValueError(f"Invalid format for test name {test}")
-            test_path, func_name = arr
-            required_tests[Path(__file__).parent / test_path].add(func_name)
-
-    new_items, deselected_items = [], []
-    for item in items:
-        func_name = getattr(item, "originalname", item.name)
-        if func_name in required_tests[item.fspath]:
-            new_items.append(item)
-            continue
-        deselected_items.append(item)
-
-    items[:] = new_items
-    config.hook.pytest_deselected(items=deselected_items)
-
-
-pytest_plugins = [
-    "llama_stack.providers.tests.inference.fixtures",
-    "llama_stack.providers.tests.safety.fixtures",
-    "llama_stack.providers.tests.vector_io.fixtures",
-    "llama_stack.providers.tests.agents.fixtures",
-    "llama_stack.providers.tests.datasetio.fixtures",
-    "llama_stack.providers.tests.scoring.fixtures",
-    "llama_stack.providers.tests.eval.fixtures",
-    "llama_stack.providers.tests.post_training.fixtures",
-    "llama_stack.providers.tests.tools.fixtures",
-]
diff --git a/llama_stack/providers/tests/report.py b/llama_stack/providers/tests/report.py
deleted file mode 100644
index bc29534be..000000000
--- a/llama_stack/providers/tests/report.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from collections import defaultdict
-from pathlib import Path
-
-import pytest
-from pytest import ExitCode
-from pytest_html.basereport import _process_outcome
-
-from llama_stack.models.llama.sku_list import all_registered_models
-from llama_stack.models.llama.sku_types import CoreModelId
-
-INFERENCE_APIS = ["chat_completion"]
-FUNCTIONALITIES = ["streaming", "structured_output", "tool_calling"]
-SUPPORTED_MODELS = {
-    "ollama": {
-        CoreModelId.llama3_1_8b_instruct.value,
-        CoreModelId.llama3_1_8b_instruct.value,
-        CoreModelId.llama3_1_70b_instruct.value,
-        CoreModelId.llama3_1_70b_instruct.value,
-        CoreModelId.llama3_1_405b_instruct.value,
-        CoreModelId.llama3_1_405b_instruct.value,
-        CoreModelId.llama3_2_1b_instruct.value,
-        CoreModelId.llama3_2_1b_instruct.value,
-        CoreModelId.llama3_2_3b_instruct.value,
-        CoreModelId.llama3_2_3b_instruct.value,
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-        CoreModelId.llama3_3_70b_instruct.value,
-        CoreModelId.llama_guard_3_8b.value,
-        CoreModelId.llama_guard_3_1b.value,
-    },
-    "fireworks": {
-        CoreModelId.llama3_1_8b_instruct.value,
-        CoreModelId.llama3_1_70b_instruct.value,
-        CoreModelId.llama3_1_405b_instruct.value,
-        CoreModelId.llama3_2_1b_instruct.value,
-        CoreModelId.llama3_2_3b_instruct.value,
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-        CoreModelId.llama3_3_70b_instruct.value,
-        CoreModelId.llama_guard_3_8b.value,
-        CoreModelId.llama_guard_3_11b_vision.value,
-    },
-    "together": {
-        CoreModelId.llama3_1_8b_instruct.value,
-        CoreModelId.llama3_1_70b_instruct.value,
-        CoreModelId.llama3_1_405b_instruct.value,
-        CoreModelId.llama3_2_3b_instruct.value,
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-        CoreModelId.llama3_3_70b_instruct.value,
-        CoreModelId.llama_guard_3_8b.value,
-        CoreModelId.llama_guard_3_11b_vision.value,
-    },
-}
-
-
-class Report:
-    def __init__(self, output_path):
-        valid_file_format = (
-            output_path.split(".")[1] in ["md", "markdown"] if len(output_path.split(".")) == 2 else False
-        )
-        if not valid_file_format:
-            raise ValueError(f"Invalid output file {output_path}. Markdown file is required")
-        self.output_path = output_path
-        self.test_data = defaultdict(dict)
-        self.inference_tests = defaultdict(dict)
-
-    @pytest.hookimpl
-    def pytest_runtest_logreport(self, report):
-        # This hook is called in several phases, including setup, call and teardown
-        # The test is considered failed / error if any of the outcomes is not "Passed"
-        outcome = _process_outcome(report)
-        data = {
-            "outcome": report.outcome,
-            "longrepr": report.longrepr,
-            "name": report.nodeid,
-        }
-        if report.nodeid not in self.test_data:
-            self.test_data[report.nodeid] = data
-        elif self.test_data[report.nodeid] != outcome and outcome != "Passed":
-            self.test_data[report.nodeid] = data
-
-    @pytest.hookimpl
-    def pytest_sessionfinish(self, session, exitstatus):
-        if exitstatus <= ExitCode.INTERRUPTED:
-            return
-        report = []
-        report.append("# Llama Stack Integration Test Results Report")
-        report.append("\n## Summary")
-        report.append("\n## Supported Models: ")
-
-        header = "| Model Descriptor |"
-        dividor = "|:---|"
-        for k in SUPPORTED_MODELS.keys():
-            header += f"{k} |"
-            dividor += ":---:|"
-
-        report.append(header)
-        report.append(dividor)
-
-        rows = []
-        for model in all_registered_models():
-            if "Instruct" not in model.core_model_id.value and "Guard" not in model.core_model_id.value:
-                continue
-            row = f"| {model.core_model_id.value} |"
-            for k in SUPPORTED_MODELS.keys():
-                if model.core_model_id.value in SUPPORTED_MODELS[k]:
-                    row += " ✅ |"
-                else:
-                    row += " ❌ |"
-            rows.append(row)
-        report.extend(rows)
-
-        report.append("\n### Tests:")
-
-        for provider in SUPPORTED_MODELS.keys():
-            if provider not in self.inference_tests:
-                continue
-            report.append(f"\n #### {provider}")
-            test_table = [
-                "| Area | Model | API | Functionality Test | Status |",
-                "|:-----|:-----|:-----|:-----|:-----|",
-            ]
-            for api in INFERENCE_APIS:
-                tests = self.inference_tests[provider][api]
-                for test_nodeid in tests:
-                    row = "|{area} | {model} | {api} | {test} | {result} ".format(
-                        area="Text" if "text" in test_nodeid else "Vision",
-                        model=("Llama-3.1-8B-Instruct" if "text" in test_nodeid else "Llama3.2-11B-Vision-Instruct"),
-                        api=f"/{api}",
-                        test=self.get_simple_function_name(test_nodeid),
-                        result=("✅" if self.test_data[test_nodeid]["outcome"] == "passed" else "❌"),
-                    )
-                    test_table += [row]
-            report.extend(test_table)
-            report.append("\n")
-
-        output_file = Path(self.output_path)
-        output_file.write_text("\n".join(report))
-        print(f"\n Report generated: {output_file.absolute()}")
-
-    @pytest.hookimpl(trylast=True)
-    def pytest_collection_modifyitems(self, session, config, items):
-        for item in items:
-            inference = item.callspec.params.get("inference_stack")
-            if "inference" in item.nodeid:
-                func_name = getattr(item, "originalname", item.name)
-                for api in INFERENCE_APIS:
-                    if api in func_name:
-                        api_tests = self.inference_tests[inference].get(api, set())
-                        api_tests.add(item.nodeid)
-                        self.inference_tests[inference][api] = api_tests
-
-    def get_simple_function_name(self, nodeid):
-        """Extract function name from nodeid.
-
-        Examples:
-        - 'tests/test_math.py::test_addition' -> 'test_addition'
-        - 'tests/test_math.py::TestClass::test_method' -> test_method'
-        """
-        parts = nodeid.split("::")
-        func_name = nodeid  # Fallback to full nodeid if pattern doesn't match
-        if len(parts) == 2:  # Simple function
-            func_name = parts[1]
-        elif len(parts) == 3:  # Class method
-            func_name = parts[2]
-        return func_name.split("[")[0]
diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
index c3c2ab61f..4d17db21e 100644
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@@ -19,7 +19,7 @@ from llama_stack.apis.inference import (
     ChatCompletionResponseStreamChunk,
     EmbeddingsResponse,
     EmbeddingTaskType,
-    Inference,
+    InferenceProvider,
     JsonSchemaResponseFormat,
     LogProbConfig,
     Message,
@@ -59,9 +59,12 @@ logger = get_logger(name=__name__, category="inference")
 
 class LiteLLMOpenAIMixin(
     ModelRegistryHelper,
-    Inference,
+    InferenceProvider,
     NeedsRequestProviderData,
 ):
+    # TODO: avoid exposing the litellm specific model names to the user.
+    #       potential change: add a prefix param that gets added to the model name
+    #                         when calling litellm.
     def __init__(
         self,
         model_entries,
@@ -92,7 +95,9 @@ class LiteLLMOpenAIMixin(
         return model
 
     def get_litellm_model_name(self, model_id: str) -> str:
-        return "openai/" + model_id if self.is_openai_compat else model_id
+        # Users may already include the "openai/" prefix in their model names, since
+        # openai/models.py added it by default; check for it to stay backwards compatible.
+        return "openai/" + model_id if self.is_openai_compat and not model_id.startswith("openai/") else model_id
 
     async def completion(
         self,
diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py
index e2314d44f..cc0000528 100644
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@@ -531,13 +531,19 @@ async def convert_message_to_openai_dict(message: Message, download: bool = Fals
             tool_name = tc.tool_name
             if isinstance(tool_name, BuiltinTool):
                 tool_name = tool_name.value
+
+            # arguments_json can be None, so attempt it first and fall back to arguments
+            if hasattr(tc, "arguments_json") and tc.arguments_json:
+                arguments = tc.arguments_json
+            else:
+                arguments = json.dumps(tc.arguments)
             result["tool_calls"].append(
                 {
                     "id": tc.call_id,
                     "type": "function",
                     "function": {
                         "name": tool_name,
-                        "arguments": tc.arguments_json if hasattr(tc, "arguments_json") else json.dumps(tc.arguments),
+                        "arguments": arguments,
                     },
                 }
             )
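With the old one-liner, an `arguments_json` attribute that existed but held `None` would be passed through as the arguments string. The hunk above serializes `arguments` whenever `arguments_json` is falsy; a small sketch of that fallback, using a stand-in tool-call object:

```python
import json


class FakeToolCall:  # stand-in for the real ToolCall type, for illustration only
    def __init__(self, arguments, arguments_json=None):
        self.arguments = arguments
        self.arguments_json = arguments_json


def tool_call_arguments(tc) -> str:
    # Prefer the pre-serialized JSON only when it is actually populated;
    # otherwise serialize the structured arguments ourselves.
    if getattr(tc, "arguments_json", None):
        return tc.arguments_json
    return json.dumps(tc.arguments)


print(tool_call_arguments(FakeToolCall({"city": "Paris"})))                       # {"city": "Paris"}
print(tool_call_arguments(FakeToolCall({"city": "Paris"}, '{"city": "Paris"}')))  # pre-serialized JSON wins
```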
diff --git a/llama_stack/providers/utils/inference/prompt_adapter.py b/llama_stack/providers/utils/inference/prompt_adapter.py
index d53b51537..56e33cfdf 100644
--- a/llama_stack/providers/utils/inference/prompt_adapter.py
+++ b/llama_stack/providers/utils/inference/prompt_adapter.py
@@ -382,7 +382,7 @@ def augment_messages_for_tools_llama_3_1(
 
     messages.append(SystemMessage(content=sys_content))
 
-    has_custom_tools = any(isinstance(dfn.tool_name, str) for dfn in request.tools)
+    has_custom_tools = request.tools is not None and any(isinstance(dfn.tool_name, str) for dfn in request.tools)
     if has_custom_tools:
         fmt = request.tool_config.tool_prompt_format or ToolPromptFormat.json
         if fmt == ToolPromptFormat.json:
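The added guard matters because `any()` over `None` raises a `TypeError`, so requests without a `tools` field now short-circuit cleanly. A minimal reproduction of the check, with a hypothetical stand-in for the request's tool definition objects:

```python
from dataclasses import dataclass


@dataclass
class ToolDef:
    tool_name: object  # str for custom tools, an enum member for builtins


def has_custom_tools(tools: list[ToolDef] | None) -> bool:
    # The None check must come first: any(... for t in None) raises TypeError.
    return tools is not None and any(isinstance(t.tool_name, str) for t in tools)


assert has_custom_tools(None) is False
assert has_custom_tools([ToolDef("get_weather")]) is True
```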
diff --git a/llama_stack/providers/utils/kvstore/api.py b/llama_stack/providers/utils/kvstore/api.py
index 8efde4ea9..d17dc66e1 100644
--- a/llama_stack/providers/utils/kvstore/api.py
+++ b/llama_stack/providers/utils/kvstore/api.py
@@ -16,4 +16,6 @@ class KVStore(Protocol):
 
     async def delete(self, key: str) -> None: ...
 
-    async def range(self, start_key: str, end_key: str) -> list[str]: ...
+    async def values_in_range(self, start_key: str, end_key: str) -> list[str]: ...
+
+    async def keys_in_range(self, start_key: str, end_key: str) -> list[str]: ...
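The protocol now exposes two range methods instead of one generic `range`: `values_in_range` keeps the old behavior of returning values, while `keys_in_range` returns the matching keys. A hedged caller-side sketch; the prefix convention and `"\xff"` sentinel upper bound are assumptions here, not part of the patch:

```python
async def list_keys(store, prefix: str) -> list[str]:
    # Keys compare lexicographically; "\xff" serves as a conventional upper bound
    # for "everything that starts with prefix".
    return await store.keys_in_range(prefix, prefix + "\xff")
```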
diff --git a/llama_stack/providers/utils/kvstore/kvstore.py b/llama_stack/providers/utils/kvstore/kvstore.py
index 0eb969b65..3a1ee8a26 100644
--- a/llama_stack/providers/utils/kvstore/kvstore.py
+++ b/llama_stack/providers/utils/kvstore/kvstore.py
@@ -26,9 +26,16 @@ class InmemoryKVStoreImpl(KVStore):
     async def set(self, key: str, value: str) -> None:
         self._store[key] = value
 
-    async def range(self, start_key: str, end_key: str) -> list[str]:
+    async def values_in_range(self, start_key: str, end_key: str) -> list[str]:
         return [self._store[key] for key in self._store.keys() if key >= start_key and key < end_key]
 
+    async def keys_in_range(self, start_key: str, end_key: str) -> list[str]:
+        """Get all keys in the given range."""
+        return [key for key in self._store.keys() if key >= start_key and key < end_key]
+
+    async def delete(self, key: str) -> None:
+        del self._store[key]
+
 
 async def kvstore_impl(config: KVStoreConfig) -> KVStore:
     if config.type == KVStoreType.redis.value:
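The in-memory backend picks up both range methods plus a `delete`. A quick behavioral check, assuming the class initializes an empty `_store` dict in its constructor (not shown in this hunk):

```python
import asyncio

from llama_stack.providers.utils.kvstore.kvstore import InmemoryKVStoreImpl


async def demo():
    store = InmemoryKVStoreImpl()
    await store.set("session:a", "1")
    await store.set("session:b", "2")
    await store.set("user:x", "3")
    # Half-open range [start_key, end_key); ";" is the character right after ":",
    # so this covers every "session:"-prefixed key and excludes "user:x".
    print(await store.keys_in_range("session:", "session;"))    # ['session:a', 'session:b']
    print(await store.values_in_range("session:", "session;"))  # ['1', '2']
    await store.delete("session:a")


asyncio.run(demo())
```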
diff --git a/llama_stack/providers/utils/kvstore/mongodb/mongodb.py b/llama_stack/providers/utils/kvstore/mongodb/mongodb.py
index 330a079bf..3842773d9 100644
--- a/llama_stack/providers/utils/kvstore/mongodb/mongodb.py
+++ b/llama_stack/providers/utils/kvstore/mongodb/mongodb.py
@@ -57,7 +57,7 @@ class MongoDBKVStoreImpl(KVStore):
         key = self._namespaced_key(key)
         await self.collection.delete_one({"key": key})
 
-    async def range(self, start_key: str, end_key: str) -> list[str]:
+    async def values_in_range(self, start_key: str, end_key: str) -> list[str]:
         start_key = self._namespaced_key(start_key)
         end_key = self._namespaced_key(end_key)
         query = {
@@ -68,3 +68,10 @@ class MongoDBKVStoreImpl(KVStore):
         async for doc in cursor:
             result.append(doc["value"])
         return result
+
+    async def keys_in_range(self, start_key: str, end_key: str) -> list[str]:
+        start_key = self._namespaced_key(start_key)
+        end_key = self._namespaced_key(end_key)
+        query = {"key": {"$gte": start_key, "$lt": end_key}}
+        cursor = self.collection.find(query, {"key": 1, "_id": 0}).sort("key", 1)
+        return [doc["key"] for doc in cursor]
diff --git a/llama_stack/providers/utils/kvstore/postgres/postgres.py b/llama_stack/providers/utils/kvstore/postgres/postgres.py
index 6bfbc4f81..bd35decfc 100644
--- a/llama_stack/providers/utils/kvstore/postgres/postgres.py
+++ b/llama_stack/providers/utils/kvstore/postgres/postgres.py
@@ -85,7 +85,7 @@ class PostgresKVStoreImpl(KVStore):
             (key,),
         )
 
-    async def range(self, start_key: str, end_key: str) -> list[str]:
+    async def values_in_range(self, start_key: str, end_key: str) -> list[str]:
         start_key = self._namespaced_key(start_key)
         end_key = self._namespaced_key(end_key)
 
@@ -99,3 +99,13 @@ class PostgresKVStoreImpl(KVStore):
             (start_key, end_key),
         )
         return [row[0] for row in self.cursor.fetchall()]
+
+    async def keys_in_range(self, start_key: str, end_key: str) -> list[str]:
+        start_key = self._namespaced_key(start_key)
+        end_key = self._namespaced_key(end_key)
+
+        self.cursor.execute(
+            f"SELECT key FROM {self.config.table_name} WHERE key >= %s AND key < %s",
+            (start_key, end_key),
+        )
+        return [row[0] for row in self.cursor.fetchall()]
diff --git a/llama_stack/providers/utils/kvstore/redis/redis.py b/llama_stack/providers/utils/kvstore/redis/redis.py
index d95de0291..3d2d956c3 100644
--- a/llama_stack/providers/utils/kvstore/redis/redis.py
+++ b/llama_stack/providers/utils/kvstore/redis/redis.py
@@ -42,7 +42,7 @@ class RedisKVStoreImpl(KVStore):
         key = self._namespaced_key(key)
         await self.redis.delete(key)
 
-    async def range(self, start_key: str, end_key: str) -> list[str]:
+    async def values_in_range(self, start_key: str, end_key: str) -> list[str]:
         start_key = self._namespaced_key(start_key)
         end_key = self._namespaced_key(end_key)
         cursor = 0
@@ -67,3 +67,10 @@ class RedisKVStoreImpl(KVStore):
             ]
 
         return []
+
+    async def keys_in_range(self, start_key: str, end_key: str) -> list[str]:
+        """Get all keys in the given range."""
+        matching_keys = await self.redis.zrangebylex(self.namespace, f"[{start_key}", f"[{end_key}")
+        if not matching_keys:
+            return []
+        return [k.decode("utf-8") for k in matching_keys]
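In Redis, lexical range queries run against a sorted set whose members all share the same score, and a leading `[` marks an inclusive bound. A standalone sketch of the command the new method relies on, against a local Redis with the redis-py async client (the key names and sentinel are illustrative):

```python
import asyncio

import redis.asyncio as aioredis


async def demo():
    r = aioredis.Redis()
    # ZRANGEBYLEX requires all members to share the same score.
    await r.zadd("kv:index", {"user:1": 0, "user:2": 0, "video:9": 0})
    # "[" makes a bound inclusive; "\xff" is an assumed upper sentinel for the prefix.
    keys = await r.zrangebylex("kv:index", "[user:", "[user:\xff")
    print([k.decode("utf-8") for k in keys])  # ['user:1', 'user:2']


asyncio.run(demo())
```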
diff --git a/llama_stack/providers/utils/kvstore/sqlite/sqlite.py b/llama_stack/providers/utils/kvstore/sqlite/sqlite.py
index faca77887..4e49e4d8c 100644
--- a/llama_stack/providers/utils/kvstore/sqlite/sqlite.py
+++ b/llama_stack/providers/utils/kvstore/sqlite/sqlite.py
@@ -54,7 +54,7 @@ class SqliteKVStoreImpl(KVStore):
             await db.execute(f"DELETE FROM {self.table_name} WHERE key = ?", (key,))
             await db.commit()
 
-    async def range(self, start_key: str, end_key: str) -> list[str]:
+    async def values_in_range(self, start_key: str, end_key: str) -> list[str]:
         async with aiosqlite.connect(self.db_path) as db:
             async with db.execute(
                 f"SELECT key, value, expiration FROM {self.table_name} WHERE key >= ? AND key <= ?",
@@ -65,3 +65,13 @@ class SqliteKVStoreImpl(KVStore):
                     _, value, _ = row
                     result.append(value)
                 return result
+
+    async def keys_in_range(self, start_key: str, end_key: str) -> list[str]:
+        """Get all keys in the given range."""
+        async with aiosqlite.connect(self.db_path) as db:
+            cursor = await db.execute(
+                f"SELECT key FROM {self.table_name} WHERE key >= ? AND key <= ?",
+                (start_key, end_key),
+            )
+            rows = await cursor.fetchall()
+            return [row[0] for row in rows]
diff --git a/llama_stack/providers/utils/memory/vector_store.py b/llama_stack/providers/utils/memory/vector_store.py
index f4834969a..e0e9d0679 100644
--- a/llama_stack/providers/utils/memory/vector_store.py
+++ b/llama_stack/providers/utils/memory/vector_store.py
@@ -118,45 +118,53 @@ async def content_from_doc(doc: RAGDocument) -> str:
     if isinstance(doc.content, URL):
         if doc.content.uri.startswith("data:"):
             return content_from_data(doc.content.uri)
-        else:
-            async with httpx.AsyncClient() as client:
-                r = await client.get(doc.content.uri)
-            if doc.mime_type == "application/pdf":
-                return parse_pdf(r.content)
-            else:
-                return r.text
-
-    pattern = re.compile("^(https?://|file://|data:)")
-    if pattern.match(doc.content):
-        if doc.content.startswith("data:"):
-            return content_from_data(doc.content)
-        else:
+        async with httpx.AsyncClient() as client:
+            r = await client.get(doc.content.uri)
+        if doc.mime_type == "application/pdf":
+            return parse_pdf(r.content)
+        return r.text
+    elif isinstance(doc.content, str):
+        pattern = re.compile("^(https?://|file://|data:)")
+        if pattern.match(doc.content):
+            if doc.content.startswith("data:"):
+                return content_from_data(doc.content)
             async with httpx.AsyncClient() as client:
                 r = await client.get(doc.content)
             if doc.mime_type == "application/pdf":
                 return parse_pdf(r.content)
-            else:
-                return r.text
-
-    return interleaved_content_as_str(doc.content)
+            return r.text
+        return doc.content
+    else:
+        # will raise ValueError if the content is not List[InterleavedContent] or InterleavedContent
+        return interleaved_content_as_str(doc.content)
 
 
-def make_overlapped_chunks(document_id: str, text: str, window_len: int, overlap_len: int) -> list[Chunk]:
+def make_overlapped_chunks(
+    document_id: str, text: str, window_len: int, overlap_len: int, metadata: dict[str, Any]
+) -> list[Chunk]:
     tokenizer = Tokenizer.get_instance()
     tokens = tokenizer.encode(text, bos=False, eos=False)
+    try:
+        metadata_string = str(metadata)
+    except Exception as e:
+        raise ValueError("Failed to serialize metadata to string") from e
+
+    metadata_tokens = tokenizer.encode(metadata_string, bos=False, eos=False)
 
     chunks = []
     for i in range(0, len(tokens), window_len - overlap_len):
         toks = tokens[i : i + window_len]
         chunk = tokenizer.decode(toks)
+        chunk_metadata = metadata.copy()
+        chunk_metadata["document_id"] = document_id
+        chunk_metadata["token_count"] = len(toks)
+        chunk_metadata["metadata_token_count"] = len(metadata_tokens)
+
         # chunk is a string
         chunks.append(
             Chunk(
                 content=chunk,
-                metadata={
-                    "token_count": len(toks),
-                    "document_id": document_id,
-                },
+                metadata=chunk_metadata,
             )
         )
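`make_overlapped_chunks` now takes document-level metadata and copies it into every chunk, alongside `document_id` and two token counts. A hedged usage sketch; the metadata values are illustrative, and running it assumes the Llama tokenizer assets are available locally:

```python
from llama_stack.providers.utils.memory.vector_store import make_overlapped_chunks

chunks = make_overlapped_chunks(
    document_id="doc-123",
    text="Llama Stack standardizes the core building blocks of AI apps. " * 50,
    window_len=256,
    overlap_len=32,
    metadata={"source": "user_upload", "language": "en"},
)
# Each chunk carries the document metadata plus bookkeeping fields:
# {'source': ..., 'language': ..., 'document_id': 'doc-123',
#  'token_count': <tokens in this window>, 'metadata_token_count': <tokens of str(metadata)>}
print(chunks[0].metadata)
```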
 
diff --git a/llama_stack/providers/utils/datasetio/pagination.py b/llama_stack/providers/utils/pagination.py
similarity index 100%
rename from llama_stack/providers/utils/datasetio/pagination.py
rename to llama_stack/providers/utils/pagination.py
diff --git a/llama_stack/templates/cerebras/doc_template.md b/llama_stack/templates/cerebras/doc_template.md
index 76f8c34ad..5cae2b2da 100644
--- a/llama_stack/templates/cerebras/doc_template.md
+++ b/llama_stack/templates/cerebras/doc_template.md
@@ -46,7 +46,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY
 ```
diff --git a/llama_stack/templates/cerebras/report.md b/llama_stack/templates/cerebras/report.md
deleted file mode 100644
index f240e354b..000000000
--- a/llama_stack/templates/cerebras/report.md
+++ /dev/null
@@ -1,43 +0,0 @@
-# Report for cerebras distribution
-
-## Supported Models
-| Model Descriptor | cerebras |
-|:---|:---|
-| meta-llama/Llama-3-8B-Instruct | ❌ |
-| meta-llama/Llama-3-70B-Instruct | ❌ |
-| meta-llama/Llama-3.1-8B-Instruct | ✅ |
-| meta-llama/Llama-3.1-70B-Instruct | ❌ |
-| meta-llama/Llama-3.1-405B-Instruct-FP8 | ❌ |
-| meta-llama/Llama-3.2-1B-Instruct | ❌ |
-| meta-llama/Llama-3.2-3B-Instruct | ❌ |
-| meta-llama/Llama-3.2-11B-Vision-Instruct | ❌ |
-| meta-llama/Llama-3.2-90B-Vision-Instruct | ❌ |
-| meta-llama/Llama-3.3-70B-Instruct | ✅ |
-| meta-llama/Llama-Guard-3-11B-Vision | ❌ |
-| meta-llama/Llama-Guard-3-1B | ❌ |
-| meta-llama/Llama-Guard-3-8B | ❌ |
-| meta-llama/Llama-Guard-2-8B | ❌ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
-| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ❌ |
-
-## Vector IO
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /retrieve |  | test_vector_db_retrieve | ✅ |
-
-## Agents
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /create_agent_turn | rag | test_rag_agent | ✅ |
-| /create_agent_turn | custom_tool | test_custom_tool | ❌ |
diff --git a/llama_stack/templates/dell/doc_template.md b/llama_stack/templates/dell/doc_template.md
index 26f07130b..6bdd7f81c 100644
--- a/llama_stack/templates/dell/doc_template.md
+++ b/llama_stack/templates/dell/doc_template.md
@@ -143,7 +143,7 @@ docker run \
   -v $HOME/.llama:/root/.llama \
   -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env DEH_URL=$DEH_URL \
diff --git a/llama_stack/templates/dependencies.json b/llama_stack/templates/dependencies.json
index e8136b0c3..fb4ab9fda 100644
--- a/llama_stack/templates/dependencies.json
+++ b/llama_stack/templates/dependencies.json
@@ -152,46 +152,6 @@
     "sentence-transformers --no-deps",
     "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
   ],
-  "dev": [
-    "aiosqlite",
-    "autoevals",
-    "blobfile",
-    "chardet",
-    "chromadb-client",
-    "datasets",
-    "emoji",
-    "fastapi",
-    "fire",
-    "fireworks-ai",
-    "httpx",
-    "langdetect",
-    "litellm",
-    "matplotlib",
-    "mcp",
-    "nltk",
-    "numpy",
-    "openai",
-    "opentelemetry-exporter-otlp-proto-http",
-    "opentelemetry-sdk",
-    "pandas",
-    "pillow",
-    "psycopg2-binary",
-    "pymongo",
-    "pypdf",
-    "pythainlp",
-    "redis",
-    "requests",
-    "scikit-learn",
-    "scipy",
-    "sentencepiece",
-    "sqlite-vec",
-    "tqdm",
-    "transformers",
-    "tree_sitter",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
-  ],
   "fireworks": [
     "aiosqlite",
     "autoevals",
@@ -481,6 +441,7 @@
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
+    "peft",
     "pillow",
     "psycopg2-binary",
     "pymongo",
@@ -491,9 +452,11 @@
     "scikit-learn",
     "scipy",
     "sentencepiece",
+    "torch",
     "tqdm",
     "transformers",
     "tree_sitter",
+    "trl",
     "uvicorn"
   ],
   "open-benchmark": [
@@ -619,10 +582,11 @@
     "fastapi",
     "fire",
     "httpx",
+    "litellm",
     "matplotlib",
+    "mcp",
     "nltk",
     "numpy",
-    "openai",
     "opentelemetry-exporter-otlp-proto-http",
     "opentelemetry-sdk",
     "pandas",
@@ -637,7 +601,49 @@
     "sentencepiece",
     "tqdm",
     "transformers",
-    "uvicorn"
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "starter": [
+    "aiosqlite",
+    "autoevals",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "datasets",
+    "emoji",
+    "fastapi",
+    "fire",
+    "fireworks-ai",
+    "httpx",
+    "langdetect",
+    "litellm",
+    "matplotlib",
+    "mcp",
+    "nltk",
+    "numpy",
+    "openai",
+    "opentelemetry-exporter-otlp-proto-http",
+    "opentelemetry-sdk",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pymongo",
+    "pypdf",
+    "pythainlp",
+    "redis",
+    "requests",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "sqlite-vec",
+    "tqdm",
+    "transformers",
+    "tree_sitter",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
   ],
   "tgi": [
     "aiohttp",
@@ -830,6 +836,8 @@
     "tqdm",
     "transformers",
     "tree_sitter",
-    "uvicorn"
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
   ]
 }
diff --git a/llama_stack/templates/experimental-post-training/build.yaml b/llama_stack/templates/experimental-post-training/build.yaml
index b4b5e2203..55cd189c6 100644
--- a/llama_stack/templates/experimental-post-training/build.yaml
+++ b/llama_stack/templates/experimental-post-training/build.yaml
@@ -13,9 +13,10 @@ distribution_spec:
     - inline::basic
     - inline::braintrust
     post_training:
-    - inline::torchtune
+    - inline::huggingface
     datasetio:
     - inline::localfs
+    - remote::huggingface
     telemetry:
     - inline::meta-reference
     agents:
diff --git a/llama_stack/templates/experimental-post-training/run.yaml b/llama_stack/templates/experimental-post-training/run.yaml
index 2ebdfe1aa..393cba41d 100644
--- a/llama_stack/templates/experimental-post-training/run.yaml
+++ b/llama_stack/templates/experimental-post-training/run.yaml
@@ -49,16 +49,24 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/experimental-post-training}/localfs_datasetio.db
+  - provider_id: huggingface
+    provider_type: remote::huggingface
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/huggingface}/huggingface_datasetio.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config: {}
   post_training:
-  - provider_id: torchtune-post-training
-    provider_type: inline::torchtune
-    config: {
+  - provider_id: huggingface
+    provider_type: inline::huggingface
+    config:
       checkpoint_format: huggingface
-    }
+      distributed_backend: null
+      device: cpu
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
diff --git a/llama_stack/templates/fireworks/report.md b/llama_stack/templates/fireworks/report.md
deleted file mode 100644
index b520acf8e..000000000
--- a/llama_stack/templates/fireworks/report.md
+++ /dev/null
@@ -1,45 +0,0 @@
-# Report for fireworks distribution
-
-## Supported Models
-| Model Descriptor | fireworks |
-|:---|:---|
-| Llama-3-8B-Instruct | ❌ |
-| Llama-3-70B-Instruct | ❌ |
-| Llama3.1-8B-Instruct | ✅ |
-| Llama3.1-70B-Instruct | ✅ |
-| Llama3.1-405B-Instruct | ✅ |
-| Llama3.2-1B-Instruct | ✅ |
-| Llama3.2-3B-Instruct | ✅ |
-| Llama3.2-11B-Vision-Instruct | ✅ |
-| Llama3.2-90B-Vision-Instruct | ✅ |
-| Llama3.3-70B-Instruct | ✅ |
-| Llama-Guard-3-11B-Vision | ✅ |
-| Llama-Guard-3-1B | ❌ |
-| Llama-Guard-3-8B | ✅ |
-| Llama-Guard-2-8B | ❌ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_non_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |
-
-## Vector IO
-| Provider | API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|:-----|
-| inline::faiss | /retrieve |  | test_vector_db_retrieve | ✅ |
-
-## Agents
-| Provider | API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|:-----|
-| inline::meta-reference | /create_agent_turn | rag | test_rag_agent | ✅ |
-| inline::meta-reference | /create_agent_turn | custom_tool | test_custom_tool | ✅ |
diff --git a/llama_stack/templates/nvidia/doc_template.md b/llama_stack/templates/nvidia/doc_template.md
index 3fc73ee5e..6138e2804 100644
--- a/llama_stack/templates/nvidia/doc_template.md
+++ b/llama_stack/templates/nvidia/doc_template.md
@@ -116,7 +116,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env NVIDIA_API_KEY=$NVIDIA_API_KEY
 ```
diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml
index 88e61bf8a..7d5363575 100644
--- a/llama_stack/templates/ollama/build.yaml
+++ b/llama_stack/templates/ollama/build.yaml
@@ -23,6 +23,8 @@ distribution_spec:
     - inline::basic
     - inline::llm-as-judge
     - inline::braintrust
+    post_training:
+    - inline::huggingface
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
diff --git a/llama_stack/templates/ollama/doc_template.md b/llama_stack/templates/ollama/doc_template.md
index f961ab7ed..aaa65bab2 100644
--- a/llama_stack/templates/ollama/doc_template.md
+++ b/llama_stack/templates/ollama/doc_template.md
@@ -86,7 +86,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/ollama/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env SAFETY_MODEL=$SAFETY_MODEL \
diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py
index d72d299ec..0b4f05128 100644
--- a/llama_stack/templates/ollama/ollama.py
+++ b/llama_stack/templates/ollama/ollama.py
@@ -13,6 +13,7 @@ from llama_stack.distribution.datatypes import (
     ShieldInput,
     ToolGroupInput,
 )
+from llama_stack.providers.inline.post_training.huggingface import HuggingFacePostTrainingConfig
 from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
@@ -28,6 +29,7 @@ def get_distribution_template() -> DistributionTemplate:
         "eval": ["inline::meta-reference"],
         "datasetio": ["remote::huggingface", "inline::localfs"],
         "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
+        "post_training": ["inline::huggingface"],
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
@@ -47,7 +49,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="inline::faiss",
         config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
     )
-
+    posttraining_provider = Provider(
+        provider_id="huggingface",
+        provider_type="inline::huggingface",
+        config=HuggingFacePostTrainingConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+    )
     inference_model = ModelInput(
         model_id="${env.INFERENCE_MODEL}",
         provider_id="ollama",
@@ -92,6 +98,7 @@ def get_distribution_template() -> DistributionTemplate:
                 provider_overrides={
                     "inference": [inference_provider],
                     "vector_io": [vector_io_provider_faiss],
+                    "post_training": [posttraining_provider],
                 },
                 default_models=[inference_model, embedding_model],
                 default_tool_groups=default_tool_groups,
@@ -100,6 +107,7 @@ def get_distribution_template() -> DistributionTemplate:
                 provider_overrides={
                     "inference": [inference_provider],
                     "vector_io": [vector_io_provider_faiss],
+                    "post_training": [posttraining_provider],
                     "safety": [
                         Provider(
                             provider_id="llama-guard",
diff --git a/llama_stack/templates/ollama/report.md b/llama_stack/templates/ollama/report.md
deleted file mode 100644
index 4b2dada3a..000000000
--- a/llama_stack/templates/ollama/report.md
+++ /dev/null
@@ -1,43 +0,0 @@
-# Report for ollama distribution
-
-## Supported Models
-| Model Descriptor | ollama |
-|:---|:---|
-| Llama-3-8B-Instruct | ❌ |
-| Llama-3-70B-Instruct | ❌ |
-| Llama3.1-8B-Instruct | ✅ |
-| Llama3.1-70B-Instruct | ✅ |
-| Llama3.1-405B-Instruct | ✅ |
-| Llama3.2-1B-Instruct | ✅ |
-| Llama3.2-3B-Instruct | ✅ |
-| Llama3.2-11B-Vision-Instruct | ✅ |
-| Llama3.2-90B-Vision-Instruct | ✅ |
-| Llama3.3-70B-Instruct | ✅ |
-| Llama-Guard-3-11B-Vision | ❌ |
-| Llama-Guard-3-1B | ✅ |
-| Llama-Guard-3-8B | ✅ |
-| Llama-Guard-2-8B | ❌ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
-| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |
-
-## Vector IO
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /retrieve |  | test_vector_db_retrieve | ✅ |
-
-## Agents
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /create_agent_turn | rag | test_rag_agent | ✅ |
-| /create_agent_turn | custom_tool | test_custom_tool | ✅ |
diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml
index 651d58117..74d0822ca 100644
--- a/llama_stack/templates/ollama/run-with-safety.yaml
+++ b/llama_stack/templates/ollama/run-with-safety.yaml
@@ -5,6 +5,7 @@ apis:
 - datasetio
 - eval
 - inference
+- post_training
 - safety
 - scoring
 - telemetry
@@ -80,6 +81,13 @@ providers:
     provider_type: inline::braintrust
     config:
       openai_api_key: ${env.OPENAI_API_KEY:}
+  post_training:
+  - provider_id: huggingface
+    provider_type: inline::huggingface
+    config:
+      checkpoint_format: huggingface
+      distributed_backend: null
+      device: cpu
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml
index 1372486fe..71229be97 100644
--- a/llama_stack/templates/ollama/run.yaml
+++ b/llama_stack/templates/ollama/run.yaml
@@ -5,6 +5,7 @@ apis:
 - datasetio
 - eval
 - inference
+- post_training
 - safety
 - scoring
 - telemetry
@@ -78,6 +79,13 @@ providers:
     provider_type: inline::braintrust
     config:
       openai_api_key: ${env.OPENAI_API_KEY:}
+  post_training:
+  - provider_id: huggingface
+    provider_type: inline::huggingface
+    config:
+      checkpoint_format: huggingface
+      distributed_backend: null
+      device: cpu
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
diff --git a/llama_stack/templates/remote-vllm/doc_template.md b/llama_stack/templates/remote-vllm/doc_template.md
index 3cede6080..5684888da 100644
--- a/llama_stack/templates/remote-vllm/doc_template.md
+++ b/llama_stack/templates/remote-vllm/doc_template.md
@@ -220,7 +220,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1
@@ -242,7 +242,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/remote-vllm/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1 \
diff --git a/llama_stack/templates/sambanova/build.yaml b/llama_stack/templates/sambanova/build.yaml
index 6fd5b2905..81d90f420 100644
--- a/llama_stack/templates/sambanova/build.yaml
+++ b/llama_stack/templates/sambanova/build.yaml
@@ -1,9 +1,10 @@
 version: '2'
 distribution_spec:
-  description: Use SambaNova.AI for running LLM inference
+  description: Use SambaNova for running LLM inference
   providers:
     inference:
     - remote::sambanova
+    - inline::sentence-transformers
     vector_io:
     - inline::faiss
     - remote::chromadb
@@ -18,4 +19,6 @@ distribution_spec:
     - remote::brave-search
     - remote::tavily-search
     - inline::rag-runtime
+    - remote::model-context-protocol
+    - remote::wolfram-alpha
 image_type: conda
diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml
index 2bf2bf722..620d50307 100644
--- a/llama_stack/templates/sambanova/run.yaml
+++ b/llama_stack/templates/sambanova/run.yaml
@@ -14,6 +14,9 @@ providers:
     config:
       url: https://api.sambanova.ai/v1
       api_key: ${env.SAMBANOVA_API_KEY}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -68,110 +71,122 @@ providers:
   - provider_id: rag-runtime
     provider_type: inline::rag-runtime
     config: {}
+  - provider_id: model-context-protocol
+    provider_type: remote::model-context-protocol
+    config: {}
+  - provider_id: wolfram-alpha
+    provider_type: remote::wolfram-alpha
+    config:
+      api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/sambanova}/registry.db
 models:
 - metadata: {}
-  model_id: Meta-Llama-3.1-8B-Instruct
+  model_id: sambanova/Meta-Llama-3.1-8B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.1-8B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-8B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.1-8B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.1-70B-Instruct
+  model_id: sambanova/Meta-Llama-3.1-405B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.1-70B-Instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: sambanova
-  provider_model_id: Meta-Llama-3.1-70B-Instruct
-  model_type: llm
-- metadata: {}
-  model_id: Meta-Llama-3.1-405B-Instruct
-  provider_id: sambanova
-  provider_model_id: Meta-Llama-3.1-405B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.1-405B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.2-1B-Instruct
+  model_id: sambanova/Meta-Llama-3.2-1B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.2-1B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-1B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.2-1B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.2-3B-Instruct
+  model_id: sambanova/Meta-Llama-3.2-3B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.2-3B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-3B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.2-3B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.3-70B-Instruct
+  model_id: sambanova/Meta-Llama-3.3-70B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.3-70B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.3-70B-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-3.3-70B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Llama-3.2-11B-Vision-Instruct
+  model_id: sambanova/Llama-3.2-11B-Vision-Instruct
   provider_id: sambanova
-  provider_model_id: Llama-3.2-11B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
   provider_id: sambanova
-  provider_model_id: Llama-3.2-11B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Llama-3.2-90B-Vision-Instruct
+  model_id: sambanova/Llama-3.2-90B-Vision-Instruct
   provider_id: sambanova
-  provider_model_id: Llama-3.2-90B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
   provider_id: sambanova
-  provider_model_id: Llama-3.2-90B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-Guard-3-8B
+  model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
   provider_id: sambanova
-  provider_model_id: Meta-Llama-Guard-3-8B
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-Guard-3-8B
-  provider_id: sambanova
-  provider_model_id: Meta-Llama-Guard-3-8B
-  model_type: llm
-- metadata: {}
-  model_id: Llama-4-Scout-17B-16E-Instruct
-  provider_id: sambanova
-  provider_model_id: Llama-4-Scout-17B-16E-Instruct
+  provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
   provider_id: sambanova
-  provider_model_id: Llama-4-Scout-17B-16E-Instruct
+  provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
   model_type: llm
+- metadata: {}
+  model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-Guard-3-8B
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-Guard-3-8B
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-Guard-3-8B
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-Guard-3-8B
+  model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  model_type: embedding
 shields:
 - shield_id: meta-llama/Llama-Guard-3-8B
 vector_dbs: []
@@ -183,5 +198,7 @@ tool_groups:
   provider_id: tavily-search
 - toolgroup_id: builtin::rag
   provider_id: rag-runtime
+- toolgroup_id: builtin::wolfram_alpha
+  provider_id: wolfram-alpha
 server:
   port: 8321
diff --git a/llama_stack/templates/sambanova/sambanova.py b/llama_stack/templates/sambanova/sambanova.py
index f1862221b..2f8a0b08a 100644
--- a/llama_stack/templates/sambanova/sambanova.py
+++ b/llama_stack/templates/sambanova/sambanova.py
@@ -6,7 +6,16 @@
 
 from pathlib import Path
 
-from llama_stack.distribution.datatypes import Provider, ShieldInput, ToolGroupInput
+from llama_stack.apis.models.models import ModelType
+from llama_stack.distribution.datatypes import (
+    ModelInput,
+    Provider,
+    ShieldInput,
+    ToolGroupInput,
+)
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.remote.inference.sambanova import SambaNovaImplConfig
 from llama_stack.providers.remote.inference.sambanova.models import MODEL_ENTRIES
@@ -23,7 +32,7 @@ from llama_stack.templates.template import (
 
 def get_distribution_template() -> DistributionTemplate:
     providers = {
-        "inference": ["remote::sambanova"],
+        "inference": ["remote::sambanova", "inline::sentence-transformers"],
         "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
@@ -32,16 +41,29 @@ def get_distribution_template() -> DistributionTemplate:
             "remote::brave-search",
             "remote::tavily-search",
             "inline::rag-runtime",
+            "remote::model-context-protocol",
+            "remote::wolfram-alpha",
         ],
     }
     name = "sambanova"
-
     inference_provider = Provider(
         provider_id=name,
         provider_type=f"remote::{name}",
         config=SambaNovaImplConfig.sample_run_config(),
     )
-
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
     vector_io_providers = [
         Provider(
             provider_id="faiss",
@@ -79,23 +101,27 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::rag",
             provider_id="rag-runtime",
         ),
+        ToolGroupInput(
+            toolgroup_id="builtin::wolfram_alpha",
+            provider_id="wolfram-alpha",
+        ),
     ]
 
     return DistributionTemplate(
         name=name,
         distro_type="self_hosted",
-        description="Use SambaNova.AI for running LLM inference",
-        docker_image=None,
+        description="Use SambaNova for running LLM inference",
+        container_image=None,
         template_path=Path(__file__).parent / "doc_template.md",
         providers=providers,
         available_models_by_provider=available_models,
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                     "vector_io": vector_io_providers,
                 },
-                default_models=default_models,
+                default_models=default_models + [embedding_model],
                 default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
                 default_tool_groups=default_tool_groups,
             ),
@@ -107,7 +133,7 @@ def get_distribution_template() -> DistributionTemplate:
             ),
             "SAMBANOVA_API_KEY": (
                 "",
-                "SambaNova.AI API Key",
+                "SambaNova API Key",
             ),
         },
     )
diff --git a/llama_stack/templates/starter/__init__.py b/llama_stack/templates/starter/__init__.py
new file mode 100644
index 000000000..9c0d937ce
--- /dev/null
+++ b/llama_stack/templates/starter/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .starter import get_distribution_template  # noqa: F401
diff --git a/llama_stack/templates/dev/build.yaml b/llama_stack/templates/starter/build.yaml
similarity index 86%
rename from llama_stack/templates/dev/build.yaml
rename to llama_stack/templates/starter/build.yaml
index df45f1319..35bd0c713 100644
--- a/llama_stack/templates/dev/build.yaml
+++ b/llama_stack/templates/starter/build.yaml
@@ -1,6 +1,6 @@
 version: '2'
 distribution_spec:
-  description: Distribution for running e2e tests in CI
+  description: Quick start template for running Llama Stack with several popular providers
   providers:
     inference:
     - remote::openai
@@ -8,6 +8,7 @@ distribution_spec:
     - remote::anthropic
     - remote::gemini
     - remote::groq
+    - remote::sambanova
     - inline::sentence-transformers
     vector_io:
     - inline::sqlite-vec
diff --git a/llama_stack/templates/dev/run.yaml b/llama_stack/templates/starter/run.yaml
similarity index 67%
rename from llama_stack/templates/dev/run.yaml
rename to llama_stack/templates/starter/run.yaml
index b77650bfe..402695850 100644
--- a/llama_stack/templates/dev/run.yaml
+++ b/llama_stack/templates/starter/run.yaml
@@ -1,5 +1,5 @@
 version: '2'
-image_name: dev
+image_name: starter
 apis:
 - agents
 - datasetio
@@ -34,6 +34,11 @@ providers:
     config:
       url: https://api.groq.com
       api_key: ${env.GROQ_API_KEY:}
+  - provider_id: sambanova
+    provider_type: remote::sambanova
+    config:
+      url: https://api.sambanova.ai/v1
+      api_key: ${env.SAMBANOVA_API_KEY:}
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}
@@ -41,7 +46,7 @@ providers:
   - provider_id: sqlite-vec
     provider_type: inline::sqlite-vec
     config:
-      db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/sqlite_vec.db
+      db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/sqlite_vec.db
   - provider_id: ${env.ENABLE_CHROMADB+chromadb}
     provider_type: remote::chromadb
     config:
@@ -66,14 +71,14 @@ providers:
       persistence_store:
         type: sqlite
         namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/agents_store.db
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/agents_store.db
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
       service_name: ${env.OTEL_SERVICE_NAME:}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
-      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/trace_store.db
+      sqlite_db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/trace_store.db
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -81,7 +86,7 @@ providers:
       kvstore:
         type: sqlite
         namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/meta_reference_eval.db
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/meta_reference_eval.db
   datasetio:
   - provider_id: huggingface
     provider_type: remote::huggingface
@@ -89,14 +94,14 @@ providers:
       kvstore:
         type: sqlite
         namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/huggingface_datasetio.db
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/huggingface_datasetio.db
   - provider_id: localfs
     provider_type: inline::localfs
     config:
       kvstore:
         type: sqlite
         namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/localfs_datasetio.db
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/localfs_datasetio.db
   scoring:
   - provider_id: basic
     provider_type: inline::basic
@@ -127,7 +132,7 @@ providers:
     config: {}
 metadata_store:
   type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/registry.db
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/registry.db
 models:
 - metadata: {}
   model_id: openai/gpt-4o
@@ -144,6 +149,76 @@ models:
   provider_id: openai
   provider_model_id: openai/chatgpt-4o-latest
   model_type: llm
+- metadata: {}
+  model_id: gpt-3.5-turbo-0125
+  provider_id: openai
+  provider_model_id: gpt-3.5-turbo-0125
+  model_type: llm
+- metadata: {}
+  model_id: gpt-3.5-turbo
+  provider_id: openai
+  provider_model_id: gpt-3.5-turbo
+  model_type: llm
+- metadata: {}
+  model_id: gpt-3.5-turbo-instruct
+  provider_id: openai
+  provider_model_id: gpt-3.5-turbo-instruct
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4
+  provider_id: openai
+  provider_model_id: gpt-4
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4-turbo
+  provider_id: openai
+  provider_model_id: gpt-4-turbo
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o
+  provider_id: openai
+  provider_model_id: gpt-4o
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o-2024-08-06
+  provider_id: openai
+  provider_model_id: gpt-4o-2024-08-06
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o-mini
+  provider_id: openai
+  provider_model_id: gpt-4o-mini
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o-audio-preview
+  provider_id: openai
+  provider_model_id: gpt-4o-audio-preview
+  model_type: llm
+- metadata: {}
+  model_id: chatgpt-4o-latest
+  provider_id: openai
+  provider_model_id: chatgpt-4o-latest
+  model_type: llm
+- metadata: {}
+  model_id: o1
+  provider_id: openai
+  provider_model_id: o1
+  model_type: llm
+- metadata: {}
+  model_id: o1-mini
+  provider_id: openai
+  provider_model_id: o1-mini
+  model_type: llm
+- metadata: {}
+  model_id: o3-mini
+  provider_id: openai
+  provider_model_id: o3-mini
+  model_type: llm
+- metadata: {}
+  model_id: o4-mini
+  provider_id: openai
+  provider_model_id: o4-mini
+  model_type: llm
 - metadata:
     embedding_dimension: 1536
     context_length: 8192
@@ -158,6 +233,20 @@ models:
   provider_id: openai
   provider_model_id: openai/text-embedding-3-large
   model_type: embedding
+- metadata:
+    embedding_dimension: 1536
+    context_length: 8192
+  model_id: text-embedding-3-small
+  provider_id: openai
+  provider_model_id: text-embedding-3-small
+  model_type: embedding
+- metadata:
+    embedding_dimension: 3072
+    context_length: 8192
+  model_id: text-embedding-3-large
+  provider_id: openai
+  provider_model_id: text-embedding-3-large
+  model_type: embedding
 - metadata: {}
   model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
   provider_id: fireworks
@@ -413,6 +502,106 @@ models:
   provider_id: groq
   provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
   model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-3.1-8B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.1-8B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-3.1-405B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-3.2-1B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-1B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-3.2-3B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-3B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-3.3-70B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.3-70B-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Llama-3.2-11B-Vision-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Llama-3.2-90B-Vision-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: sambanova
+  provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-Guard-3-8B
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-Guard-3-8B
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-Guard-3-8B
+  provider_id: sambanova
+  provider_model_id: sambanova/Meta-Llama-Guard-3-8B
+  model_type: llm
 - metadata:
     embedding_dimension: 384
   model_id: all-MiniLM-L6-v2
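The SambaNova rows added above register each model under two ids — the provider-prefixed name and the `meta-llama/...` alias — with both pointing at the same `provider_model_id`. A minimal sketch of how such a registry could resolve either name (illustrative only; the mapping mirrors the YAML rows above and `resolve_model()` is a hypothetical helper, not the stack's own code):

```python
# Illustrative sketch: resolve a user-facing model_id to the id the provider expects.
MODEL_REGISTRY = {
    "sambanova/Meta-Llama-3.1-8B-Instruct": ("sambanova", "sambanova/Meta-Llama-3.1-8B-Instruct"),
    "meta-llama/Llama-3.1-8B-Instruct": ("sambanova", "sambanova/Meta-Llama-3.1-8B-Instruct"),
}

def resolve_model(model_id: str) -> tuple[str, str]:
    """Return (provider_id, provider_model_id) for a registered model_id."""
    try:
        return MODEL_REGISTRY[model_id]
    except KeyError:
        raise ValueError(f"model {model_id!r} is not registered") from None

# Both spellings resolve to the same provider-side model.
assert resolve_model("meta-llama/Llama-3.1-8B-Instruct") == resolve_model(
    "sambanova/Meta-Llama-3.1-8B-Instruct"
)
```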
diff --git a/llama_stack/templates/dev/dev.py b/llama_stack/templates/starter/starter.py
similarity index 90%
rename from llama_stack/templates/dev/dev.py
rename to llama_stack/templates/starter/starter.py
index 4cf8e7d22..0932bfdfe 100644
--- a/llama_stack/templates/dev/dev.py
+++ b/llama_stack/templates/starter/starter.py
@@ -38,10 +38,15 @@ from llama_stack.providers.remote.inference.openai.config import OpenAIConfig
 from llama_stack.providers.remote.inference.openai.models import (
     MODEL_ENTRIES as OPENAI_MODEL_ENTRIES,
 )
+from llama_stack.providers.remote.inference.sambanova.config import SambaNovaImplConfig
+from llama_stack.providers.remote.inference.sambanova.models import (
+    MODEL_ENTRIES as SAMBANOVA_MODEL_ENTRIES,
+)
 from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig
 from llama_stack.providers.remote.vector_io.pgvector.config import (
     PGVectorVectorIOConfig,
 )
+from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry
 from llama_stack.templates.template import (
     DistributionTemplate,
     RunConfigSettings,
@@ -49,7 +54,7 @@ from llama_stack.templates.template import (
 )
 
 
-def get_inference_providers() -> tuple[list[Provider], list[ModelInput]]:
+def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderModelEntry]]]:
     # in this template, we allow each API key to be optional
     providers = [
         (
@@ -77,6 +82,11 @@ def get_inference_providers() -> tuple[list[Provider], list[ModelInput]]:
             GROQ_MODEL_ENTRIES,
             GroqConfig.sample_run_config(api_key="${env.GROQ_API_KEY:}"),
         ),
+        (
+            "sambanova",
+            SAMBANOVA_MODEL_ENTRIES,
+            SambaNovaImplConfig.sample_run_config(api_key="${env.SAMBANOVA_API_KEY:}"),
+        ),
     ]
     inference_providers = []
     available_models = {}
@@ -110,7 +120,7 @@ def get_distribution_template() -> DistributionTemplate:
             "remote::model-context-protocol",
         ],
     }
-    name = "dev"
+    name = "starter"
 
     vector_io_providers = [
         Provider(
@@ -162,7 +172,7 @@ def get_distribution_template() -> DistributionTemplate:
     return DistributionTemplate(
         name=name,
         distro_type="self_hosted",
-        description="Distribution for running e2e tests in CI",
+        description="Quick start template for running Llama Stack with several popular providers",
         container_image=None,
         template_path=None,
         providers=providers,
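The starter.py hunk above changes `get_inference_providers()` to return model entries keyed by provider name, but the loop that consumes the `(provider_id, model_entries, config)` tuples sits outside the diff. A rough sketch of what that loop plausibly does, using a plain dict as a stand-in for the stack's `Provider` object (assumption, not copied from the source):

```python
def get_inference_providers_sketch(
    providers: list[tuple[str, list, object]],
) -> tuple[list[dict], dict[str, list]]:
    """Hedged sketch of the loop consuming the (provider_id, model_entries, config)
    tuples declared in the hunk above; the real body is not shown in this diff."""
    inference_providers: list[dict] = []
    available_models: dict[str, list] = {}
    for provider_id, model_entries, config in providers:
        inference_providers.append(
            {  # stand-in for llama_stack's Provider(...)
                "provider_id": provider_id,
                "provider_type": f"remote::{provider_id}",
                "config": config,
            }
        )
        available_models[provider_id] = model_entries
    return inference_providers, available_models
```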
diff --git a/llama_stack/templates/tgi/doc_template.md b/llama_stack/templates/tgi/doc_template.md
index b69ccaa56..68b475893 100644
--- a/llama_stack/templates/tgi/doc_template.md
+++ b/llama_stack/templates/tgi/doc_template.md
@@ -105,7 +105,7 @@ docker run \
   -v ~/.llama:/root/.llama \
   -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \
diff --git a/llama_stack/templates/tgi/report.md b/llama_stack/templates/tgi/report.md
deleted file mode 100644
index b0f5d88a2..000000000
--- a/llama_stack/templates/tgi/report.md
+++ /dev/null
@@ -1,44 +0,0 @@
-# Report for tgi distribution
-
-## Supported Models
-| Model Descriptor | tgi |
-|:---|:---|
-| Llama-3-8B-Instruct | ✅ |
-| Llama-3-70B-Instruct | ✅ |
-| Llama3.1-8B-Instruct | ✅ |
-| Llama3.1-70B-Instruct | ✅ |
-| Llama3.1-405B-Instruct | ✅ |
-| Llama3.2-1B-Instruct | ✅ |
-| Llama3.2-3B-Instruct | ✅ |
-| Llama3.2-11B-Vision-Instruct | ✅ |
-| Llama3.2-90B-Vision-Instruct | ✅ |
-| Llama3.3-70B-Instruct | ✅ |
-| Llama-Guard-3-11B-Vision | ✅ |
-| Llama-Guard-3-1B | ✅ |
-| Llama-Guard-3-8B | ✅ |
-| Llama-Guard-2-8B | ✅ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ |
-| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |
-
-## Vector IO
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /retrieve |  | test_vector_db_retrieve | ✅ |
-
-## Agents
-| API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|
-| /create_agent_turn | rag | test_rag_agent | ✅ |
-| /create_agent_turn | custom_tool | test_custom_tool | ✅ |
-| /create_agent_turn | code_execution | test_code_interpreter_for_attachments | ✅ |
diff --git a/llama_stack/templates/together/report.md b/llama_stack/templates/together/report.md
deleted file mode 100644
index 71ae83597..000000000
--- a/llama_stack/templates/together/report.md
+++ /dev/null
@@ -1,45 +0,0 @@
-# Report for together distribution
-
-## Supported Models
-| Model Descriptor | together |
-|:---|:---|
-| Llama-3-8B-Instruct | ❌ |
-| Llama-3-70B-Instruct | ❌ |
-| Llama3.1-8B-Instruct | ✅ |
-| Llama3.1-70B-Instruct | ✅ |
-| Llama3.1-405B-Instruct | ✅ |
-| Llama3.2-1B-Instruct | ❌ |
-| Llama3.2-3B-Instruct | ✅ |
-| Llama3.2-11B-Vision-Instruct | ✅ |
-| Llama3.2-90B-Vision-Instruct | ✅ |
-| Llama3.3-70B-Instruct | ✅ |
-| Llama-Guard-3-11B-Vision | ✅ |
-| Llama-Guard-3-1B | ❌ |
-| Llama-Guard-3-8B | ✅ |
-| Llama-Guard-2-8B | ❌ |
-
-## Inference
-| Model | API | Capability | Test | Status |
-|:----- |:-----|:-----|:-----|:-----|
-| Llama-3.1-8B-Instruct | /chat_completion | streaming | test_text_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | streaming | test_image_chat_completion_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_non_streaming | ✅ |
-| Llama-3.2-11B-Vision-Instruct | /chat_completion | log_probs | test_completion_log_probs_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | streaming | test_text_completion_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | non_streaming | test_text_completion_non_streaming | ✅ |
-| Llama-3.1-8B-Instruct | /completion | structured_output | test_text_completion_structured_output | ✅ |
-
-## Vector IO
-| Provider | API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|:-----|
-| inline::faiss | /retrieve |  | test_vector_db_retrieve | ✅ |
-
-## Agents
-| Provider | API | Capability | Test | Status |
-|:-----|:-----|:-----|:-----|:-----|
-| inline::meta-reference | /create_agent_turn | rag | test_rag_agent | ✅ |
-| inline::meta-reference | /create_agent_turn | custom_tool | test_custom_tool | ✅ |
diff --git a/llama_stack/templates/verification/run.yaml b/llama_stack/templates/verification/run.yaml
index fbe491453..11af41da9 100644
--- a/llama_stack/templates/verification/run.yaml
+++ b/llama_stack/templates/verification/run.yaml
@@ -151,6 +151,76 @@ models:
   provider_id: openai
   provider_model_id: openai/chatgpt-4o-latest
   model_type: llm
+- metadata: {}
+  model_id: gpt-3.5-turbo-0125
+  provider_id: openai
+  provider_model_id: gpt-3.5-turbo-0125
+  model_type: llm
+- metadata: {}
+  model_id: gpt-3.5-turbo
+  provider_id: openai
+  provider_model_id: gpt-3.5-turbo
+  model_type: llm
+- metadata: {}
+  model_id: gpt-3.5-turbo-instruct
+  provider_id: openai
+  provider_model_id: gpt-3.5-turbo-instruct
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4
+  provider_id: openai
+  provider_model_id: gpt-4
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4-turbo
+  provider_id: openai
+  provider_model_id: gpt-4-turbo
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o
+  provider_id: openai
+  provider_model_id: gpt-4o
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o-2024-08-06
+  provider_id: openai
+  provider_model_id: gpt-4o-2024-08-06
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o-mini
+  provider_id: openai
+  provider_model_id: gpt-4o-mini
+  model_type: llm
+- metadata: {}
+  model_id: gpt-4o-audio-preview
+  provider_id: openai
+  provider_model_id: gpt-4o-audio-preview
+  model_type: llm
+- metadata: {}
+  model_id: chatgpt-4o-latest
+  provider_id: openai
+  provider_model_id: chatgpt-4o-latest
+  model_type: llm
+- metadata: {}
+  model_id: o1
+  provider_id: openai
+  provider_model_id: o1
+  model_type: llm
+- metadata: {}
+  model_id: o1-mini
+  provider_id: openai
+  provider_model_id: o1-mini
+  model_type: llm
+- metadata: {}
+  model_id: o3-mini
+  provider_id: openai
+  provider_model_id: o3-mini
+  model_type: llm
+- metadata: {}
+  model_id: o4-mini
+  provider_id: openai
+  provider_model_id: o4-mini
+  model_type: llm
 - metadata:
     embedding_dimension: 1536
     context_length: 8192
@@ -165,6 +235,20 @@ models:
   provider_id: openai
   provider_model_id: openai/text-embedding-3-large
   model_type: embedding
+- metadata:
+    embedding_dimension: 1536
+    context_length: 8192
+  model_id: text-embedding-3-small
+  provider_id: openai
+  provider_model_id: text-embedding-3-small
+  model_type: embedding
+- metadata:
+    embedding_dimension: 3072
+    context_length: 8192
+  model_id: text-embedding-3-large
+  provider_id: openai
+  provider_model_id: text-embedding-3-large
+  model_type: embedding
 - metadata: {}
   model_id: accounts/fireworks/models/llama-v3p1-8b-instruct
   provider_id: fireworks-openai-compat
@@ -502,104 +586,104 @@ models:
   provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.1-8B-Instruct
+  model_id: sambanova/Meta-Llama-3.1-8B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.1-8B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-8B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.1-8B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.1-70B-Instruct
+  model_id: sambanova/Meta-Llama-3.1-405B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.1-70B-Instruct
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-3.1-70B-Instruct
-  provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.1-70B-Instruct
-  model_type: llm
-- metadata: {}
-  model_id: Meta-Llama-3.1-405B-Instruct
-  provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.1-405B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.1-405B-Instruct-FP8
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.1-405B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.2-1B-Instruct
+  model_id: sambanova/Meta-Llama-3.2-1B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.2-1B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-1B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.2-1B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.2-3B-Instruct
+  model_id: sambanova/Meta-Llama-3.2-3B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.2-3B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-3B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.2-3B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-3.3-70B-Instruct
+  model_id: sambanova/Meta-Llama-3.3-70B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.3-70B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.3-70B-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-3.3-70B-Instruct
+  provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Llama-3.2-11B-Vision-Instruct
+  model_id: sambanova/Llama-3.2-11B-Vision-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Llama-3.2-11B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Llama-3.2-11B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Llama-3.2-90B-Vision-Instruct
+  model_id: sambanova/Llama-3.2-90B-Vision-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Llama-3.2-90B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Llama-3.2-90B-Vision-Instruct
+  provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct
   model_type: llm
 - metadata: {}
-  model_id: Meta-Llama-Guard-3-8B
+  model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-Guard-3-8B
-  model_type: llm
-- metadata: {}
-  model_id: meta-llama/Llama-Guard-3-8B
-  provider_id: sambanova-openai-compat
-  provider_model_id: Meta-Llama-Guard-3-8B
-  model_type: llm
-- metadata: {}
-  model_id: Llama-4-Scout-17B-16E-Instruct
-  provider_id: sambanova-openai-compat
-  provider_model_id: Llama-4-Scout-17B-16E-Instruct
+  provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
   model_type: llm
 - metadata: {}
   model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
   provider_id: sambanova-openai-compat
-  provider_model_id: Llama-4-Scout-17B-16E-Instruct
+  provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: sambanova-openai-compat
+  provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: sambanova-openai-compat
+  provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct
+  model_type: llm
+- metadata: {}
+  model_id: sambanova/Meta-Llama-Guard-3-8B
+  provider_id: sambanova-openai-compat
+  provider_model_id: sambanova/Meta-Llama-Guard-3-8B
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-Guard-3-8B
+  provider_id: sambanova-openai-compat
+  provider_model_id: sambanova/Meta-Llama-Guard-3-8B
   model_type: llm
 - metadata: {}
   model_id: llama3.1-8b
diff --git a/llama_stack/templates/watsonx/build.yaml b/llama_stack/templates/watsonx/build.yaml
index 23a1ffa74..638b16029 100644
--- a/llama_stack/templates/watsonx/build.yaml
+++ b/llama_stack/templates/watsonx/build.yaml
@@ -4,6 +4,7 @@ distribution_spec:
   providers:
     inference:
     - remote::watsonx
+    - inline::sentence-transformers
     vector_io:
     - inline::faiss
     safety:
diff --git a/llama_stack/templates/watsonx/doc_template.md b/llama_stack/templates/watsonx/doc_template.md
index af0ae15a8..f28dbf0bf 100644
--- a/llama_stack/templates/watsonx/doc_template.md
+++ b/llama_stack/templates/watsonx/doc_template.md
@@ -56,7 +56,7 @@ docker run \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
+  --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env WATSONX_API_KEY=$WATSONX_API_KEY \
   --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
diff --git a/llama_stack/templates/watsonx/run.yaml b/llama_stack/templates/watsonx/run.yaml
index dddb0670c..8de6a2b6c 100644
--- a/llama_stack/templates/watsonx/run.yaml
+++ b/llama_stack/templates/watsonx/run.yaml
@@ -18,6 +18,9 @@ providers:
       url: ${env.WATSONX_BASE_URL:https://us-south.ml.cloud.ibm.com}
       api_key: ${env.WATSONX_API_KEY:}
       project_id: ${env.WATSONX_PROJECT_ID:}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
@@ -191,6 +194,11 @@ models:
   provider_id: watsonx
   provider_model_id: meta-llama/llama-guard-3-11b-vision
   model_type: llm
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  model_type: embedding
 shields: []
 vector_dbs: []
 datasets: []
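The run.yaml values above use the `${env.NAME:default}` placeholder form, where the text after the colon is the fallback used when the variable is unset (an empty fallback, as in `${env.WATSONX_API_KEY:}`, makes the key optional). A small standalone sketch of that substitution, written for illustration rather than taken from the stack's own resolver:

```python
import os
import re

_ENV_PATTERN = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*):([^}]*)\}")

def expand_env_placeholders(value: str) -> str:
    """Replace ${env.NAME:default} with os.environ[NAME], falling back to default."""
    def _sub(match: re.Match) -> str:
        name, default = match.group(1), match.group(2)
        return os.environ.get(name, default)
    return _ENV_PATTERN.sub(_sub, value)

# With WATSONX_BASE_URL unset, the default URL from run.yaml is kept:
print(expand_env_placeholders("${env.WATSONX_BASE_URL:https://us-south.ml.cloud.ibm.com}"))
```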
diff --git a/llama_stack/templates/watsonx/watsonx.py b/llama_stack/templates/watsonx/watsonx.py
index f16593051..802aaf8f1 100644
--- a/llama_stack/templates/watsonx/watsonx.py
+++ b/llama_stack/templates/watsonx/watsonx.py
@@ -6,7 +6,11 @@
 
 from pathlib import Path
 
-from llama_stack.distribution.datatypes import Provider, ToolGroupInput
+from llama_stack.apis.models.models import ModelType
+from llama_stack.distribution.datatypes import ModelInput, Provider, ToolGroupInput
+from llama_stack.providers.inline.inference.sentence_transformers import (
+    SentenceTransformersInferenceConfig,
+)
 from llama_stack.providers.remote.inference.watsonx import WatsonXConfig
 from llama_stack.providers.remote.inference.watsonx.models import MODEL_ENTRIES
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
@@ -14,7 +18,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin
 
 def get_distribution_template() -> DistributionTemplate:
     providers = {
-        "inference": ["remote::watsonx"],
+        "inference": ["remote::watsonx", "inline::sentence-transformers"],
         "vector_io": ["inline::faiss"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
@@ -36,6 +40,12 @@ def get_distribution_template() -> DistributionTemplate:
         config=WatsonXConfig.sample_run_config(),
     )
 
+    embedding_provider = Provider(
+        provider_id="sentence-transformers",
+        provider_type="inline::sentence-transformers",
+        config=SentenceTransformersInferenceConfig.sample_run_config(),
+    )
+
     available_models = {
         "watsonx": MODEL_ENTRIES,
     }
@@ -50,6 +60,15 @@ def get_distribution_template() -> DistributionTemplate:
         ),
     ]
 
+    embedding_model = ModelInput(
+        model_id="all-MiniLM-L6-v2",
+        provider_id="sentence-transformers",
+        model_type=ModelType.embedding,
+        metadata={
+            "embedding_dimension": 384,
+        },
+    )
+
     default_models = get_model_registry(available_models)
     return DistributionTemplate(
         name="watsonx",
@@ -62,9 +81,9 @@ def get_distribution_template() -> DistributionTemplate:
         run_configs={
             "run.yaml": RunConfigSettings(
                 provider_overrides={
-                    "inference": [inference_provider],
+                    "inference": [inference_provider, embedding_provider],
                 },
-                default_models=default_models,
+                default_models=default_models + [embedding_model],
                 default_tool_groups=default_tool_groups,
             ),
         },
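The new inline provider registers `all-MiniLM-L6-v2` with `embedding_dimension: 384`. Outside of Llama Stack, the underlying model behaves roughly as below — a standalone sketch using the `sentence-transformers` package, not the provider's actual code path:

```python
# Standalone illustration of the model registered above; the real implementation
# lives in llama_stack.providers.inline.inference.sentence_transformers.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
vectors = model.encode(["Llama Stack supports local embedding models."])
print(vectors.shape)  # (1, 384) -- matching embedding_dimension: 384 in run.yaml
```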
diff --git a/llama_stack/ui/.gitignore b/llama_stack/ui/.gitignore
new file mode 100644
index 000000000..5ef6a5207
--- /dev/null
+++ b/llama_stack/ui/.gitignore
@@ -0,0 +1,41 @@
+# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
+
+# dependencies
+/node_modules
+/.pnp
+.pnp.*
+.yarn/*
+!.yarn/patches
+!.yarn/plugins
+!.yarn/releases
+!.yarn/versions
+
+# testing
+/coverage
+
+# next.js
+/.next/
+/out/
+
+# production
+/build
+
+# misc
+.DS_Store
+*.pem
+
+# debug
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+.pnpm-debug.log*
+
+# env files (can opt-in for committing if needed)
+.env*
+
+# vercel
+.vercel
+
+# typescript
+*.tsbuildinfo
+next-env.d.ts
diff --git a/llama_stack/ui/README.md b/llama_stack/ui/README.md
new file mode 100644
index 000000000..e3e21bf0b
--- /dev/null
+++ b/llama_stack/ui/README.md
@@ -0,0 +1,26 @@
+## This is WIP.
+
+
+We use [shadcn/ui](https://ui.shadcn.com/) for the UI components.
+
+## Getting Started
+
+First, install dependencies:
+
+```bash
+npm install next react react-dom
+```
+
+Then, run the development server:
+
+```bash
+npm run dev
+# or
+yarn dev
+# or
+pnpm dev
+# or
+bun dev
+```
+
+Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.
diff --git a/llama_stack/ui/app/favicon.ico b/llama_stack/ui/app/favicon.ico
new file mode 100644
index 000000000..718d6fea4
Binary files /dev/null and b/llama_stack/ui/app/favicon.ico differ
diff --git a/llama_stack/ui/app/globals.css b/llama_stack/ui/app/globals.css
new file mode 100644
index 000000000..dc98be74c
--- /dev/null
+++ b/llama_stack/ui/app/globals.css
@@ -0,0 +1,122 @@
+@import "tailwindcss";
+@import "tw-animate-css";
+
+@custom-variant dark (&:is(.dark *));
+
+@theme inline {
+  --color-background: var(--background);
+  --color-foreground: var(--foreground);
+  --font-sans: var(--font-geist-sans);
+  --font-mono: var(--font-geist-mono);
+  --color-sidebar-ring: var(--sidebar-ring);
+  --color-sidebar-border: var(--sidebar-border);
+  --color-sidebar-accent-foreground: var(--sidebar-accent-foreground);
+  --color-sidebar-accent: var(--sidebar-accent);
+  --color-sidebar-primary-foreground: var(--sidebar-primary-foreground);
+  --color-sidebar-primary: var(--sidebar-primary);
+  --color-sidebar-foreground: var(--sidebar-foreground);
+  --color-sidebar: var(--sidebar);
+  --color-chart-5: var(--chart-5);
+  --color-chart-4: var(--chart-4);
+  --color-chart-3: var(--chart-3);
+  --color-chart-2: var(--chart-2);
+  --color-chart-1: var(--chart-1);
+  --color-ring: var(--ring);
+  --color-input: var(--input);
+  --color-border: var(--border);
+  --color-destructive: var(--destructive);
+  --color-accent-foreground: var(--accent-foreground);
+  --color-accent: var(--accent);
+  --color-muted-foreground: var(--muted-foreground);
+  --color-muted: var(--muted);
+  --color-secondary-foreground: var(--secondary-foreground);
+  --color-secondary: var(--secondary);
+  --color-primary-foreground: var(--primary-foreground);
+  --color-primary: var(--primary);
+  --color-popover-foreground: var(--popover-foreground);
+  --color-popover: var(--popover);
+  --color-card-foreground: var(--card-foreground);
+  --color-card: var(--card);
+  --radius-sm: calc(var(--radius) - 4px);
+  --radius-md: calc(var(--radius) - 2px);
+  --radius-lg: var(--radius);
+  --radius-xl: calc(var(--radius) + 4px);
+}
+
+:root {
+  --radius: 0.625rem;
+  --background: oklch(1 0 0);
+  --foreground: oklch(0.145 0 0);
+  --card: oklch(1 0 0);
+  --card-foreground: oklch(0.145 0 0);
+  --popover: oklch(1 0 0);
+  --popover-foreground: oklch(0.145 0 0);
+  --primary: oklch(0.205 0 0);
+  --primary-foreground: oklch(0.985 0 0);
+  --secondary: oklch(0.97 0 0);
+  --secondary-foreground: oklch(0.205 0 0);
+  --muted: oklch(0.97 0 0);
+  --muted-foreground: oklch(0.556 0 0);
+  --accent: oklch(0.97 0 0);
+  --accent-foreground: oklch(0.205 0 0);
+  --destructive: oklch(0.577 0.245 27.325);
+  --border: oklch(0.922 0 0);
+  --input: oklch(0.922 0 0);
+  --ring: oklch(0.708 0 0);
+  --chart-1: oklch(0.646 0.222 41.116);
+  --chart-2: oklch(0.6 0.118 184.704);
+  --chart-3: oklch(0.398 0.07 227.392);
+  --chart-4: oklch(0.828 0.189 84.429);
+  --chart-5: oklch(0.769 0.188 70.08);
+  --sidebar: oklch(0.985 0 0);
+  --sidebar-foreground: oklch(0.145 0 0);
+  --sidebar-primary: oklch(0.205 0 0);
+  --sidebar-primary-foreground: oklch(0.985 0 0);
+  --sidebar-accent: oklch(0.97 0 0);
+  --sidebar-accent-foreground: oklch(0.205 0 0);
+  --sidebar-border: oklch(0.922 0 0);
+  --sidebar-ring: oklch(0.708 0 0);
+}
+
+.dark {
+  --background: oklch(0.145 0 0);
+  --foreground: oklch(0.985 0 0);
+  --card: oklch(0.205 0 0);
+  --card-foreground: oklch(0.985 0 0);
+  --popover: oklch(0.205 0 0);
+  --popover-foreground: oklch(0.985 0 0);
+  --primary: oklch(0.922 0 0);
+  --primary-foreground: oklch(0.205 0 0);
+  --secondary: oklch(0.269 0 0);
+  --secondary-foreground: oklch(0.985 0 0);
+  --muted: oklch(0.269 0 0);
+  --muted-foreground: oklch(0.708 0 0);
+  --accent: oklch(0.269 0 0);
+  --accent-foreground: oklch(0.985 0 0);
+  --destructive: oklch(0.704 0.191 22.216);
+  --border: oklch(1 0 0 / 10%);
+  --input: oklch(1 0 0 / 15%);
+  --ring: oklch(0.556 0 0);
+  --chart-1: oklch(0.488 0.243 264.376);
+  --chart-2: oklch(0.696 0.17 162.48);
+  --chart-3: oklch(0.769 0.188 70.08);
+  --chart-4: oklch(0.627 0.265 303.9);
+  --chart-5: oklch(0.645 0.246 16.439);
+  --sidebar: oklch(0.205 0 0);
+  --sidebar-foreground: oklch(0.985 0 0);
+  --sidebar-primary: oklch(0.488 0.243 264.376);
+  --sidebar-primary-foreground: oklch(0.985 0 0);
+  --sidebar-accent: oklch(0.269 0 0);
+  --sidebar-accent-foreground: oklch(0.985 0 0);
+  --sidebar-border: oklch(1 0 0 / 10%);
+  --sidebar-ring: oklch(0.556 0 0);
+}
+
+@layer base {
+  * {
+    @apply border-border outline-ring/50;
+  }
+  body {
+    @apply bg-background text-foreground;
+  }
+}
diff --git a/llama_stack/ui/app/layout.tsx b/llama_stack/ui/app/layout.tsx
new file mode 100644
index 000000000..f029002dd
--- /dev/null
+++ b/llama_stack/ui/app/layout.tsx
@@ -0,0 +1,55 @@
+import type { Metadata } from "next";
+import { ThemeProvider } from "@/components/ui/theme-provider";
+import { Geist, Geist_Mono } from "next/font/google";
+import { ModeToggle } from "@/components/ui/mode-toggle";
+import "./globals.css";
+
+const geistSans = Geist({
+  variable: "--font-geist-sans",
+  subsets: ["latin"],
+});
+
+const geistMono = Geist_Mono({
+  variable: "--font-geist-mono",
+  subsets: ["latin"],
+});
+
+export const metadata: Metadata = {
+  title: "Llama Stack",
+  description: "Llama Stack UI",
+};
+
+import { SidebarProvider, SidebarTrigger } from "@/components/ui/sidebar";
+import { AppSidebar } from "@/components/app-sidebar";
+
[Remaining new-file hunks, markup omitted: app/layout.tsx — root layout that wraps pages in ThemeProvider and SidebarProvider, renders AppSidebar, and places SidebarTrigger and ModeToggle in a header row above the page content; app/page.tsx — "Welcome to Llama Stack!" placeholder page; app/logs/chat-completions/page.tsx and app/logs/responses/page.tsx — "Under Construction" placeholder pages; components.json — shadcn/ui configuration (new-york style, Tailwind CSS variables, neutral base color, lucide icons); components/app-sidebar.tsx — sidebar with Chat Completions, Responses, and Documentation links; components/ui/button.tsx, dropdown-menu.tsx, input.tsx, mode-toggle.tsx, separator.tsx, sheet.tsx, and sidebar.tsx — the shadcn/ui primitives used by the layout.]