Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-03 18:00:36 +00:00)

Commit 3770963130: Merge branch 'main' into routeur

255 changed files with 18366 additions and 1909 deletions
.github/CODEOWNERS (vendored), 2 changes

@@ -2,4 +2,4 @@
 # These owners will be the default owners for everything in
 # the repo. Unless a later match takes precedence,
-* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @leseb @bbrowning @reluctantfuturist @mattf @slekkala1 @franciscojavierarceo
+* @ashwinb @raghotham @ehhuang @leseb @bbrowning @mattf @franciscojavierarceo @cdoern
.github/actions/setup-typescript-client/action.yml (vendored, new file), 35 lines

@@ -0,0 +1,35 @@
name: Setup TypeScript client
description: Conditionally checkout and link llama-stack-client-typescript based on client-version
inputs:
  client-version:
    description: 'Client version (latest or published)'
    required: true

outputs:
  ts-client-path:
    description: 'Path or version to use for TypeScript client'
    value: ${{ steps.set-path.outputs.ts-client-path }}

runs:
  using: "composite"
  steps:
    - name: Checkout TypeScript client (latest)
      if: ${{ inputs.client-version == 'latest' }}
      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
      with:
        repository: llamastack/llama-stack-client-typescript
        ref: main
        path: .ts-client-checkout

    - name: Set TS_CLIENT_PATH
      id: set-path
      shell: bash
      run: |
        if [ "${{ inputs.client-version }}" = "latest" ]; then
          echo "ts-client-path=${{ github.workspace }}/.ts-client-checkout" >> $GITHUB_OUTPUT
        elif [ "${{ inputs.client-version }}" = "published" ]; then
          echo "ts-client-path=^0.3.2" >> $GITHUB_OUTPUT
        else
          echo "::error::Invalid client-version: ${{ inputs.client-version }}"
          exit 1
        fi
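The version selection above reduces to one branch on `client-version`. As a minimal sketch, the same decision in Python (the function name and signature are illustrative, not part of the repo):

```python
def resolve_ts_client_path(client_version: str, workspace: str) -> str:
    """Mirror of the 'Set TS_CLIENT_PATH' step in the composite action."""
    if client_version == "latest":
        # Use the local checkout of llama-stack-client-typescript@main
        return f"{workspace}/.ts-client-checkout"
    if client_version == "published":
        # Use a semver range resolved from the npm registry
        return "^0.3.2"
    raise ValueError(f"Invalid client-version: {client_version}")
```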
.github/workflows/integration-tests.yml (vendored), 16 changes

@@ -93,11 +93,27 @@ jobs:
           suite: ${{ matrix.config.suite }}
           inference-mode: 'replay'
+
+      - name: Setup Node.js for TypeScript client tests
+        if: ${{ matrix.client == 'server' }}
+        uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
+        with:
+          node-version: '20'
+          cache: 'npm'
+          cache-dependency-path: tests/integration/client-typescript/package-lock.json
+
+      - name: Setup TypeScript client
+        if: ${{ matrix.client == 'server' }}
+        id: setup-ts-client
+        uses: ./.github/actions/setup-typescript-client
+        with:
+          client-version: ${{ matrix.client-version }}
+
       - name: Run tests
         if: ${{ matrix.config.allowed_clients == null || contains(matrix.config.allowed_clients, matrix.client) }}
         uses: ./.github/actions/run-and-record-tests
         env:
           OPENAI_API_KEY: dummy
           TS_CLIENT_PATH: ${{ steps.setup-ts-client.outputs.ts-client-path || '' }}
         with:
           stack-config: >-
             ${{ matrix.config.stack_config
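Since `TS_CLIENT_PATH` is either a filesystem path (for `latest`) or a semver range (for `published`), the TypeScript test harness can feed it straight into a package.json dependency. A hedged sketch of that injection step, assuming an npm-style `file:` dependency for local checkouts (this helper is hypothetical and not shown in the diff):

```python
import json

def pin_ts_client(package_json_path: str, ts_client_path: str) -> None:
    """Point the llama-stack-client dependency at a local checkout or a published range."""
    with open(package_json_path) as f:
        pkg = json.load(f)
    # npm accepts both forms: "file:/abs/path" for a checkout, "^0.3.2" for the registry.
    dep = ts_client_path if ts_client_path.startswith("^") else f"file:{ts_client_path}"
    pkg.setdefault("dependencies", {})["llama-stack-client"] = dep
    with open(package_json_path, "w") as f:
        json.dump(pkg, f, indent=2)
```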
.github/workflows/stainless-builds.yml (vendored), 48 changes

@@ -43,7 +43,41 @@ env:
   # Stainless organization dashboard
 
 jobs:
+  compute-branch:
+    runs-on: ubuntu-latest
+    outputs:
+      preview_branch: ${{ steps.compute.outputs.preview_branch }}
+      base_branch: ${{ steps.compute.outputs.base_branch }}
+      merge_branch: ${{ steps.compute.outputs.merge_branch }}
+    steps:
+      - name: Compute branch names
+        id: compute
+        run: |
+          HEAD_REPO="${{ github.event.pull_request.head.repo.full_name }}"
+          BASE_REPO="${{ github.repository }}"
+          BRANCH_NAME="${{ github.event.pull_request.head.ref }}"
+          FORK_OWNER="${{ github.event.pull_request.head.repo.owner.login }}"
+
+          if [ "$HEAD_REPO" != "$BASE_REPO" ]; then
+            # Fork PR: prefix with fork owner for isolation
+            if [ -z "$FORK_OWNER" ]; then
+              echo "Error: Fork PR detected but fork owner is empty" >&2
+              exit 1
+            fi
+            PREVIEW_BRANCH="preview/${FORK_OWNER}/${BRANCH_NAME}"
+            BASE_BRANCH="preview/base/${FORK_OWNER}/${BRANCH_NAME}"
+          else
+            # Same-repo PR
+            PREVIEW_BRANCH="preview/${BRANCH_NAME}"
+            BASE_BRANCH="preview/base/${BRANCH_NAME}"
+          fi
+
+          echo "preview_branch=${PREVIEW_BRANCH}" >> $GITHUB_OUTPUT
+          echo "base_branch=${BASE_BRANCH}" >> $GITHUB_OUTPUT
+          echo "merge_branch=${PREVIEW_BRANCH}" >> $GITHUB_OUTPUT
+
   preview:
+    needs: compute-branch
     if: github.event.action != 'closed'
     runs-on: ubuntu-latest
     permissions:
@@ -59,8 +93,6 @@ jobs:
           ref: ${{ github.event.pull_request.head.sha }}
           fetch-depth: 2
 
-      # This action builds preview SDKs from the OpenAPI spec changes and
-      # posts/updates a comment on the PR with build results and links to the preview.
       - name: Run preview builds
        uses: stainless-api/upload-openapi-spec-action/preview@32823b096b4319c53ee948d702d9052873af485f # 1.6.0
        with:
@@ -73,8 +105,11 @@ jobs:
           base_sha: ${{ github.event.pull_request.base.sha }}
           base_ref: ${{ github.event.pull_request.base.ref }}
           head_sha: ${{ github.event.pull_request.head.sha }}
+          branch: ${{ needs.compute-branch.outputs.preview_branch }}
+          base_branch: ${{ needs.compute-branch.outputs.base_branch }}
 
   merge:
+    needs: compute-branch
     if: github.event.action == 'closed' && github.event.pull_request.merged == true
     runs-on: ubuntu-latest
     permissions:
@@ -91,11 +126,11 @@ jobs:
           fetch-depth: 2
 
       # Note that this only merges in changes that happened on the last build on
-      # preview/${{ github.head_ref }}. It's possible that there are OAS/config
-      # changes that haven't been built, if the preview-sdk job didn't finish
+      # the computed preview branch. It's possible that there are OAS/config
+      # changes that haven't been built, if the preview job didn't finish
       # before this step starts. In theory we want to wait for all builds
-      # against preview/${{ github.head_ref }} to complete, but assuming that
-      # the preview-sdk job happens before the PR merge, it should be fine.
+      # against the preview branch to complete, but assuming that
+      # the preview job happens before the PR merge, it should be fine.
       - name: Run merge build
        uses: stainless-api/upload-openapi-spec-action/merge@32823b096b4319c53ee948d702d9052873af485f # 1.6.0
        with:
@@ -108,3 +143,4 @@ jobs:
           base_sha: ${{ github.event.pull_request.base.sha }}
           base_ref: ${{ github.event.pull_request.base.ref }}
           head_sha: ${{ github.event.pull_request.head.sha }}
+          merge_branch: ${{ needs.compute-branch.outputs.merge_branch }}
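The fork-isolation naming above is easiest to read as a pure function. A minimal Python sketch of the same rules the `compute-branch` job implements in bash (function and variable names are illustrative):

```python
def compute_branches(head_repo: str, base_repo: str, branch: str, fork_owner: str) -> dict:
    """Derive Stainless preview/base branch names, isolating fork PRs by owner."""
    if head_repo != base_repo:  # fork PR
        if not fork_owner:
            raise ValueError("Fork PR detected but fork owner is empty")
        preview = f"preview/{fork_owner}/{branch}"
        base = f"preview/base/{fork_owner}/{branch}"
    else:  # same-repo PR
        preview = f"preview/{branch}"
        base = f"preview/base/{branch}"
    # merge_branch mirrors preview_branch, exactly as in the workflow
    return {"preview_branch": preview, "base_branch": base, "merge_branch": preview}

# e.g. a fork PR from octocat on branch my-fix maps to preview/octocat/my-fix
```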
.gitignore (vendored), 2 changes

@@ -35,3 +35,5 @@ docs/static/imported-files/
 docs/docs/api-deprecated/
 docs/docs/api-experimental/
 docs/docs/api/
+tests/integration/client-typescript/node_modules/
+.ts-client-checkout/
@@ -199,6 +199,27 @@ repos:
             echo;
             exit 1;
           } || true
+      - id: check-api-independence
+        name: Ensure llama_stack_api does not import llama_stack
+        entry: bash
+        language: system
+        pass_filenames: false
+        require_serial: true
+        always_run: true
+        files: ^src/llama_stack_api/.*$
+        args:
+          - -c
+          - |
+            API_DIR="src/llama_stack_api"
+            grep -rn --include="*.py" -E '^[^#]*(import llama_stack\b|from llama_stack\b)' "$API_DIR" 2>/dev/null && {
+              echo "llama_stack_api must not import llama_stack";
+              exit 1;
+            }
+            [ -f "$API_DIR/pyproject.toml" ] && grep -n 'llama_stack[^_]' "$API_DIR/pyproject.toml" && {
+              echo "llama_stack_api must not depend on llama_stack in pyproject.toml";
+              exit 1;
+            }
+            exit 0
 
 ci:
   autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
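For readers who prefer Python to grep, the hook's intent can be sketched as follows (a rough equivalent under the same `src/llama_stack_api` layout; the helper is illustrative and slightly simplified, e.g. it omits the pyproject.toml dependency check):

```python
import re
from pathlib import Path

API_DIR = Path("src/llama_stack_api")
# Non-comment lines importing llama_stack; \b rejects llama_stack_api itself.
PATTERN = re.compile(r"^[^#]*(import llama_stack\b|from llama_stack\b)")

def check_api_independence() -> list[str]:
    """Return offending lines where llama_stack_api imports llama_stack."""
    offenders = []
    for py_file in API_DIR.rglob("*.py"):
        for lineno, line in enumerate(py_file.read_text().splitlines(), 1):
            if PATTERN.search(line):
                offenders.append(f"{py_file}:{lineno}: {line.strip()}")
    return offenders

if __name__ == "__main__":
    for offender in check_api_independence():
        print(offender)
```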
README.md, 77 changes

@@ -10,83 +10,6 @@
 [**Quick Start**](https://llamastack.github.io/docs/getting_started/quickstart) | [**Documentation**](https://llamastack.github.io/docs) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
 
 
-### ✨🎉 Llama 4 Support 🎉✨
-We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
-
-<details>
-
-<summary>👋 Click here to see how to run Llama 4 models on Llama Stack </summary>
-
-\
-*Note you need 8xH100 GPU-host to run these models*
-
-```bash
-pip install -U llama_stack
-
-MODEL="Llama-4-Scout-17B-16E-Instruct"
-# get meta url from llama.com
-huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL
-
-# install dependencies for the distribution
-llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
-
-# start a llama stack server
-INFERENCE_MODEL=meta-llama/$MODEL llama stack run meta-reference-gpu
-
-# install client to interact with the server
-pip install llama-stack-client
-```
-### CLI
-```bash
-# Run a chat completion
-MODEL="Llama-4-Scout-17B-16E-Instruct"
-
-llama-stack-client --endpoint http://localhost:8321 \
-  inference chat-completion \
-  --model-id meta-llama/$MODEL \
-  --message "write a haiku for meta's llama 4 models"
-
-OpenAIChatCompletion(
-    ...
-    choices=[
-        OpenAIChatCompletionChoice(
-            finish_reason='stop',
-            index=0,
-            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
-                role='assistant',
-                content='...**Silent minds awaken,** \n**Whispers of billions of words,** \n**Reasoning breaks the night.** \n\n— \n*This haiku blends the essence of LLaMA 4\'s capabilities with nature-inspired metaphor, evoking its vast training data and transformative potential.*',
-                ...
-            ),
-            ...
-        )
-    ],
-    ...
-)
-```
-### Python SDK
-```python
-from llama_stack_client import LlamaStackClient
-
-client = LlamaStackClient(base_url=f"http://localhost:8321")
-
-model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-prompt = "Write a haiku about coding"
-
-print(f"User> {prompt}")
-response = client.chat.completions.create(
-    model=model_id,
-    messages=[
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": prompt},
-    ],
-)
-print(f"Assistant> {response.choices[0].message.content}")
-```
-As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
-
-
-</details>
-
 ### 🚀 One-Line Installer 🚀
 
 To try Llama Stack locally, run:
@@ -5,4 +5,7 @@ These are the source-of-truth configuration files used to generate the Stainless
 
 A small side note: notice the `.yml` suffixes since Stainless uses that suffix typically for its configuration files.
 
-These files go hand-in-hand. As of now, only the `openapi.yml` file is automatically generated using the `scripts/run_openapi_generator.sh` script.
+These files go hand-in-hand. Both `openapi.yml` and `config.yml` are generated by `scripts/run_openapi_generator.sh`:
+
+- `openapi.yml` comes from the FastAPI-based generator.
+- `config.yml` is rendered from `scripts/openapi_generator/stainless_config/config_data.py` so the Stainless config stays in lock-step with the spec.
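A plausible shape for that rendering step, assuming `config_data.py` exposes the config as a plain dict (the `CONFIG_DATA` name and this helper are assumptions for illustration, not the repo's actual API):

```python
import yaml  # PyYAML

# Hypothetical: config_data.py exposing the Stainless config as a dict
from scripts.openapi_generator.stainless_config.config_data import CONFIG_DATA

def render_stainless_config(out_path: str = "config.yml") -> None:
    """Dump the in-repo config data as YAML so config.yml never drifts from the spec."""
    with open(out_path, "w") as f:
        yaml.safe_dump(CONFIG_DATA, f, sort_keys=False)
```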
@@ -1,8 +1,6 @@
 # yaml-language-server: $schema=https://app.stainlessapi.com/config-internal.schema.json
 
 organization:
-  # Name of your organization or company, used to determine the name of the client
-  # and headings.
   name: llama-stack-client
   docs: https://llama-stack.readthedocs.io/en/latest/
   contact: llamastack@meta.com
@@ -13,8 +11,6 @@ security_schemes:
   BearerAuth:
     type: http
     scheme: bearer
-# `targets` define the output targets and their customization options, such as
-# whether to emit the Node SDK and what it's package name should be.
 targets:
   node:
     package_name: llama-stack-client
@@ -40,25 +36,17 @@ targets:
     options:
       enable_v2: true
       back_compat_use_shared_package: false
 
-# `client_settings` define settings for the API client, such as extra constructor
-# arguments (used for authentication), retry behavior, idempotency, etc.
 client_settings:
   default_env_prefix: LLAMA_STACK_CLIENT
   opts:
     api_key:
       type: string
       read_env: LLAMA_STACK_CLIENT_API_KEY
-      auth: { security_scheme: BearerAuth }
+      auth:
+        security_scheme: BearerAuth
+      nullable: true
 
-# `environments` are a map of the name of the environment (e.g. "sandbox",
-# "production") to the corresponding url to use.
 environments:
   production: http://any-hosted-llama-stack.com
 
-# `pagination` defines [pagination schemes] which provides a template to match
-# endpoints and generate next-page and auto-pagination helpers in the SDKs.
 pagination:
   - name: datasets_iterrows
     type: offset
@@ -99,12 +87,72 @@ pagination:
         type: string
       x-stainless-pagination-property:
         purpose: next_cursor_field
-# `resources` define the structure and organziation for your API, such as how
-# methods and models are grouped together and accessed. See the [configuration
-# guide] for more information.
-#
-# [configuration guide]:
-# https://app.stainlessapi.com/docs/guides/configure#resources
+settings:
+  license: MIT
+  unwrap_response_fields:
+    - data
+  file_header: 'Copyright (c) Meta Platforms, Inc. and affiliates.
+
+    All rights reserved.
+
+
+    This source code is licensed under the terms described in the LICENSE file in
+
+    the root directory of this source tree.
+
+    '
+openapi:
+  transformations:
+    - command: mergeObject
+      reason: Better return_type using enum
+      args:
+        target:
+          - $.components.schemas
+        object:
+          ReturnType:
+            additionalProperties: false
+            properties:
+              type:
+                enum:
+                  - string
+                  - number
+                  - boolean
+                  - array
+                  - object
+                  - json
+                  - union
+                  - chat_completion_input
+                  - completion_input
+                  - agent_turn_input
+            required:
+              - type
+            type: object
+    - command: replaceProperties
+      reason: Replace return type properties with better model (see above)
+      args:
+        filter:
+          only:
+            - $.components.schemas.ScoringFn.properties.return_type
+            - $.components.schemas.RegisterScoringFunctionRequest.properties.return_type
+        value:
+          $ref: '#/components/schemas/ReturnType'
+    - command: oneOfToAnyOf
+      reason: Prism (mock server) doesn't like one of our requests as it technically
+        matches multiple variants
+readme:
+  example_requests:
+    default:
+      type: request
+      endpoint: post /v1/chat/completions
+      params: {}
+    headline:
+      type: request
+      endpoint: get /v1/models
+      params: {}
+    pagination:
+      type: request
+      endpoint: post /v1/chat/completions
+      params: {}
 resources:
   $shared:
     models:
@@ -128,19 +176,17 @@ resources:
     methods:
       get: get /v1/tools/{tool_name}
       list:
-        endpoint: get /v1/tools
-        paginated: false
-
+        endpoint: get /v1/tools
   tool_runtime:
     models:
       tool_def: ToolDef
       tool_invocation_result: ToolInvocationResult
     methods:
       list_tools:
-        endpoint: get /v1/tool-runtime/list-tools
-        paginated: false
+        endpoint: get /v1/tool-runtime/list-tools
       invoke_tool: post /v1/tool-runtime/invoke
 
   responses:
     models:
       response_object_stream: OpenAIResponseObjectStream
@@ -148,10 +194,10 @@ resources:
     methods:
       create:
         type: http
-        endpoint: post /v1/responses
         streaming:
           stream_event_model: responses.response_object_stream
           param_discriminator: stream
+        endpoint: post /v1/responses
       retrieve: get /v1/responses/{response_id}
       list:
         type: http
@@ -164,9 +210,8 @@ resources:
     methods:
       list:
         type: http
-        endpoint: get /v1/responses/{response_id}/input_items
-        paginated: false
-
+        endpoint: get /v1/responses/{response_id}/input_items
   prompts:
     models:
       prompt: Prompt
@@ -174,8 +219,8 @@ resources:
     methods:
       create: post /v1/prompts
       list:
-        endpoint: get /v1/prompts
-        paginated: false
+        endpoint: get /v1/prompts
       retrieve: get /v1/prompts/{prompt_id}
       update: post /v1/prompts/{prompt_id}
       delete: delete /v1/prompts/{prompt_id}
@@ -184,9 +229,8 @@ resources:
     versions:
       methods:
         list:
-          endpoint: get /v1/prompts/{prompt_id}/versions
-          paginated: false
-
+          endpoint: get /v1/prompts/{prompt_id}/versions
   conversations:
     models:
       conversation_object: Conversation
@@ -216,7 +260,6 @@ resources:
       delete:
         type: http
         endpoint: delete /v1/conversations/{conversation_id}/items/{item_id}
-
   inspect:
     models:
       healthInfo: HealthInfo
@@ -226,13 +269,11 @@ resources:
     methods:
       health: get /v1/health
       version: get /v1/version
 
   embeddings:
     models:
       create_embeddings_response: OpenAIEmbeddingsResponse
     methods:
       create: post /v1/embeddings
 
   chat:
     models:
       chat_completion_chunk: OpenAIChatCompletionChunk
@@ -241,14 +282,14 @@ resources:
     methods:
       create:
         type: http
-        endpoint: post /v1/chat/completions
         streaming:
           stream_event_model: chat.chat_completion_chunk
           param_discriminator: stream
+        endpoint: post /v1/chat/completions
       list:
         type: http
-        endpoint: get /v1/chat/completions
-        paginated: false
+        endpoint: get /v1/chat/completions
       retrieve:
         type: http
         endpoint: get /v1/chat/completions/{completion_id}
@@ -256,17 +297,15 @@ resources:
     methods:
       create:
         type: http
-        endpoint: post /v1/completions
         streaming:
           param_discriminator: stream
-
+        endpoint: post /v1/completions
   vector_io:
     models:
       queryChunksResponse: QueryChunksResponse
     methods:
       insert: post /v1/vector-io/insert
       query: post /v1/vector-io/query
 
   vector_stores:
     models:
       vector_store: VectorStoreObject
@@ -275,8 +314,7 @@ resources:
       vector_store_search_response: VectorStoreSearchResponsePage
     methods:
       create: post /v1/vector_stores
-      list:
-        endpoint: get /v1/vector_stores
+      list: get /v1/vector_stores
       retrieve: get /v1/vector_stores/{vector_store_id}
       update: post /v1/vector_stores/{vector_store_id}
       delete: delete /v1/vector_stores/{vector_store_id}
@@ -301,15 +339,14 @@ resources:
       retrieve: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}
       list_files: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files
       cancel: post /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel
 
   models:
     models:
       model: OpenAIModel
       list_models_response: OpenAIListModelsResponse
     methods:
       list:
-        endpoint: get /v1/models
-        paginated: false
+        endpoint: get /v1/models
       retrieve: get /v1/models/{model_id}
       register: post /v1/models
       unregister: delete /v1/models/{model_id}
@@ -317,38 +354,33 @@ resources:
       openai:
         methods:
           list:
-            endpoint: get /v1/models
-            paginated: false
-
+            endpoint: get /v1/models
   providers:
     models:
       list_providers_response: ListProvidersResponse
     methods:
       list:
-        endpoint: get /v1/providers
-        paginated: false
+        endpoint: get /v1/providers
       retrieve: get /v1/providers/{provider_id}
 
   routes:
     models:
       list_routes_response: ListRoutesResponse
     methods:
       list:
-        endpoint: get /v1/inspect/routes
-        paginated: false
-
+        endpoint: get /v1/inspect/routes
   moderations:
     models:
       create_response: ModerationObject
     methods:
       create: post /v1/moderations
 
   safety:
     models:
       run_shield_response: RunShieldResponse
     methods:
       run_shield: post /v1/safety/run-shield
 
   shields:
     models:
       shield: Shield
@@ -356,53 +388,48 @@ resources:
     methods:
       retrieve: get /v1/shields/{identifier}
       list:
-        endpoint: get /v1/shields
-        paginated: false
+        endpoint: get /v1/shields
       register: post /v1/shields
       delete: delete /v1/shields/{identifier}
 
   scoring:
     methods:
       score: post /v1/scoring/score
       score_batch: post /v1/scoring/score-batch
   scoring_functions:
-    methods:
-      retrieve: get /v1/scoring-functions/{scoring_fn_id}
-      list:
-        endpoint: get /v1/scoring-functions
-        paginated: false
-      register: post /v1/scoring-functions
-      unregister: delete /v1/scoring-functions/{scoring_fn_id}
     models:
       scoring_fn: ScoringFn
       scoring_fn_params: ScoringFnParams
       list_scoring_functions_response: ListScoringFunctionsResponse
+
+    methods:
+      retrieve: get /v1/scoring-functions/{scoring_fn_id}
+      list:
+        paginated: false
+        endpoint: get /v1/scoring-functions
+      register: post /v1/scoring-functions
+      unregister: delete /v1/scoring-functions/{scoring_fn_id}
   files:
-    models:
-      file: OpenAIFileObject
-      list_files_response: ListOpenAIFileResponse
-      delete_file_response: OpenAIFileDeleteResponse
     methods:
       create: post /v1/files
       list: get /v1/files
       retrieve: get /v1/files/{file_id}
       delete: delete /v1/files/{file_id}
       content: get /v1/files/{file_id}/content
+    models:
+      file: OpenAIFileObject
+      list_files_response: ListOpenAIFileResponse
+      delete_file_response: OpenAIFileDeleteResponse
 
   batches:
     methods:
       create: post /v1/batches
       list: get /v1/batches
       retrieve: get /v1/batches/{batch_id}
       cancel: post /v1/batches/{batch_id}/cancel
 
   alpha:
     subresources:
       inference:
         methods:
           rerank: post /v1alpha/inference/rerank
 
       post_training:
         models:
           algorithm_config: AlgorithmConfig
@@ -418,39 +445,35 @@ resources:
           cancel: post /v1alpha/post-training/job/cancel
           status: get /v1alpha/post-training/job/status
           list:
-            paginated: false
             endpoint: get /v1alpha/post-training/jobs
+            paginated: false
 
       benchmarks:
-        methods:
-          retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
-          list:
-            endpoint: get /v1alpha/eval/benchmarks
-            paginated: false
-          register: post /v1alpha/eval/benchmarks
-          unregister: delete /v1alpha/eval/benchmarks/{benchmark_id}
         models:
           benchmark: Benchmark
           list_benchmarks_response: ListBenchmarksResponse
+
+        methods:
+          retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
+          list:
+            paginated: false
+            endpoint: get /v1alpha/eval/benchmarks
+          register: post /v1alpha/eval/benchmarks
+          unregister: delete /v1alpha/eval/benchmarks/{benchmark_id}
       eval:
-        models:
-          evaluate_response: EvaluateResponse
-          benchmark_config: BenchmarkConfig
-          job: Job
         methods:
-          evaluate_rows: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
-          run_eval: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
+          evaluate_rows_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
+          run_eval_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
 
         subresources:
           jobs:
             methods:
               cancel: delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
               status: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
               retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
+        models:
+          evaluate_response: EvaluateResponse
+          benchmark_config: BenchmarkConfig
+          job: Job
 
   beta:
     subresources:
       datasets:
@@ -460,74 +483,8 @@ resources:
           register: post /v1beta/datasets
           retrieve: get /v1beta/datasets/{dataset_id}
           list:
-            endpoint: get /v1beta/datasets
-            paginated: false
+            endpoint: get /v1beta/datasets
           unregister: delete /v1beta/datasets/{dataset_id}
           iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
           appendrows: post /v1beta/datasetio/append-rows/{dataset_id}
 
-settings:
-  license: MIT
-  unwrap_response_fields: [data]
-  file_header: |
-    Copyright (c) Meta Platforms, Inc. and affiliates.
-    All rights reserved.
-
-    This source code is licensed under the terms described in the LICENSE file in
-    the root directory of this source tree.
-
-openapi:
-  transformations:
-    - command: mergeObject
-      reason: Better return_type using enum
-      args:
-        target:
-          - "$.components.schemas"
-        object:
-          ReturnType:
-            additionalProperties: false
-            properties:
-              type:
-                enum:
-                  - string
-                  - number
-                  - boolean
-                  - array
-                  - object
-                  - json
-                  - union
-                  - chat_completion_input
-                  - completion_input
-                  - agent_turn_input
-            required:
-              - type
-            type: object
-    - command: replaceProperties
-      reason: Replace return type properties with better model (see above)
-      args:
-        filter:
-          only:
-            - "$.components.schemas.ScoringFn.properties.return_type"
-            - "$.components.schemas.RegisterScoringFunctionRequest.properties.return_type"
-        value:
-          $ref: "#/components/schemas/ReturnType"
-    - command: oneOfToAnyOf
-      reason: Prism (mock server) doesn't like one of our requests as it technically matches multiple variants
-
-# `readme` is used to configure the code snippets that will be rendered in the
-# README.md of various SDKs. In particular, you can change the `headline`
-# snippet's endpoint and the arguments to call it with.
-readme:
-  example_requests:
-    default:
-      type: request
-      endpoint: post /v1/chat/completions
-      params: &ref_0 {}
-    headline:
-      type: request
-      endpoint: get /v1/models
-      params: *ref_0
-    pagination:
-      type: request
-      endpoint: post /v1/chat/completions
-      params: {}
@@ -1820,7 +1820,7 @@ paths:
       content:
         application/json:
           schema:
-            $ref: '#/components/schemas/RegisterScoringFunctionRequestLoose'
+            $ref: '#/components/schemas/RegisterScoringFunctionRequest'
       required: true
     deprecated: true
 /v1/scoring-functions/{scoring_fn_id}:
@@ -3310,7 +3310,7 @@ paths:
       content:
         application/json:
           schema:
-            $ref: '#/components/schemas/RegisterDatasetRequestLoose'
+            $ref: '#/components/schemas/RegisterDatasetRequest'
       required: true
     deprecated: true
 /v1beta/datasets/{dataset_id}:
@@ -3567,7 +3567,7 @@ paths:
       content:
         application/json:
           schema:
-            $ref: '#/components/schemas/BenchmarkConfig'
+            $ref: '#/components/schemas/RunEvalRequest'
       required: true
 /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
   get:
@@ -6739,9 +6739,10 @@ components:
           type: array
           title: Output
         parallel_tool_calls:
-          type: boolean
-          title: Parallel Tool Calls
-          default: false
+          anyOf:
+            - type: boolean
+            - type: 'null'
+          default: true
         previous_response_id:
           anyOf:
             - type: string
@@ -7141,6 +7142,11 @@ components:
           anyOf:
             - type: string
            - type: 'null'
+        parallel_tool_calls:
+          anyOf:
+            - type: boolean
+            - type: 'null'
+          default: true
         previous_response_id:
           anyOf:
             - type: string
@@ -7267,9 +7273,10 @@ components:
           type: array
           title: Output
         parallel_tool_calls:
-          type: boolean
-          title: Parallel Tool Calls
-          default: false
+          anyOf:
+            - type: boolean
+            - type: 'null'
+          default: true
         previous_response_id:
           anyOf:
             - type: string
@@ -9871,9 +9878,21 @@ components:
           title: Object
           default: vector_store.file
         attributes:
-          additionalProperties: true
+          additionalProperties:
+            anyOf:
+              - type: string
+                maxLength: 512
+              - type: number
+              - type: boolean
+            title: string | number | boolean
+          propertyNames:
+            type: string
+            maxLength: 64
           type: object
+          maxProperties: 16
           title: Attributes
+          description: Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard. Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters, booleans, or numbers.
+          x-oaiTypeLabel: map
         chunking_strategy:
           oneOf:
             - $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
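The new `attributes` constraints are easier to read as a validator: at most 16 key-value pairs, keys up to 64 characters, values limited to strings of up to 512 characters, numbers, or booleans. A minimal Python sketch of the same rules (the function is illustrative, not part of the spec):

```python
def validate_attributes(attributes: dict) -> None:
    """Enforce the vector-store file `attributes` constraints from the schema above."""
    if len(attributes) > 16:
        raise ValueError("attributes may hold at most 16 key-value pairs")
    for key, value in attributes.items():
        if not isinstance(key, str) or len(key) > 64:
            raise ValueError(f"key {key!r} must be a string of at most 64 characters")
        if isinstance(value, str):
            if len(value) > 512:
                raise ValueError(f"value for {key!r} exceeds 512 characters")
        elif not isinstance(value, (int, float, bool)):
            raise ValueError(f"value for {key!r} must be a string, number, or boolean")

# e.g. validate_attributes({"source": "crawl-2025-12", "page": 3, "reviewed": False})
```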
@@ -10602,6 +10621,14 @@ components:
         - scores
       title: EvaluateResponse
       description: The response from an evaluation.
+    RunEvalRequest:
+      properties:
+        benchmark_config:
+          $ref: '#/components/schemas/BenchmarkConfig'
+      type: object
+      required:
+        - benchmark_config
+      title: RunEvalRequest
     Job:
       properties:
         job_id:
@@ -11185,6 +11212,67 @@ components:
           - $ref: '#/components/schemas/CompletionInputType'
             title: CompletionInputType
         title: StringType | ... (9 variants)
+    RegisterScoringFunctionRequest:
+      properties:
+        scoring_fn_id:
+          type: string
+          title: Scoring Fn Id
+        description:
+          type: string
+          title: Description
+        return_type:
+          anyOf:
+            - $ref: '#/components/schemas/StringType'
+              title: StringType
+            - $ref: '#/components/schemas/NumberType'
+              title: NumberType
+            - $ref: '#/components/schemas/BooleanType'
+              title: BooleanType
+            - $ref: '#/components/schemas/ArrayType'
+              title: ArrayType
+            - $ref: '#/components/schemas/ObjectType'
+              title: ObjectType
+            - $ref: '#/components/schemas/JsonType'
+              title: JsonType
+            - $ref: '#/components/schemas/UnionType'
+              title: UnionType
+            - $ref: '#/components/schemas/ChatCompletionInputType'
+              title: ChatCompletionInputType
+            - $ref: '#/components/schemas/CompletionInputType'
+              title: CompletionInputType
+          title: StringType | ... (9 variants)
+        provider_scoring_fn_id:
+          anyOf:
+            - type: string
+            - type: 'null'
+        provider_id:
+          anyOf:
+            - type: string
+            - type: 'null'
+        params:
+          anyOf:
+            - oneOf:
+                - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
+                  title: LLMAsJudgeScoringFnParams
+                - $ref: '#/components/schemas/RegexParserScoringFnParams'
+                  title: RegexParserScoringFnParams
+                - $ref: '#/components/schemas/BasicScoringFnParams'
+                  title: BasicScoringFnParams
+              discriminator:
+                propertyName: type
+                mapping:
+                  basic: '#/components/schemas/BasicScoringFnParams'
+                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
+                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
+              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
+            - type: 'null'
+          title: Params
+      type: object
+      required:
+        - scoring_fn_id
+        - description
+        - return_type
+      title: RegisterScoringFunctionRequest
     RegisterShieldRequest:
       properties:
         shield_id:
@@ -11243,6 +11331,31 @@ components:
           - $ref: '#/components/schemas/RowsDataSource'
             title: RowsDataSource
         title: URIDataSource | RowsDataSource
+    RegisterDatasetRequest:
+      properties:
+        purpose:
+          $ref: '#/components/schemas/DatasetPurpose'
+        source:
+          anyOf:
+            - $ref: '#/components/schemas/URIDataSource'
+              title: URIDataSource
+            - $ref: '#/components/schemas/RowsDataSource'
+              title: RowsDataSource
+          title: URIDataSource | RowsDataSource
+        metadata:
+          anyOf:
+            - additionalProperties: true
+              type: object
+            - type: 'null'
+        dataset_id:
+          anyOf:
+            - type: string
+            - type: 'null'
+      type: object
+      required:
+        - purpose
+        - source
+      title: RegisterDatasetRequest
     RegisterBenchmarkRequest:
       properties:
         benchmark_id:
@@ -11979,41 +12092,6 @@ components:
       required:
         - reasoning_tokens
       title: OutputTokensDetails
-    RegisterDatasetRequestLoose:
-      properties:
-        purpose:
-          title: Purpose
-        source:
-          title: Source
-        metadata:
-          title: Metadata
-        dataset_id:
-          title: Dataset Id
-      type: object
-      required:
-        - purpose
-        - source
-      title: RegisterDatasetRequestLoose
-    RegisterScoringFunctionRequestLoose:
-      properties:
-        scoring_fn_id:
-          title: Scoring Fn Id
-        description:
-          title: Description
-        return_type:
-          title: Return Type
-        provider_scoring_fn_id:
-          title: Provider Scoring Fn Id
-        provider_id:
-          title: Provider Id
-        params:
-          title: Params
-      type: object
-      required:
-        - scoring_fn_id
-        - description
-        - return_type
-      title: RegisterScoringFunctionRequestLoose
     SearchRankingOptions:
       properties:
         ranker:
@@ -104,23 +104,19 @@ client.toolgroups.register(
 )
 ```
 
-Note that most of the more useful MCP servers need you to authenticate with them. Many of them use OAuth2.0 for authentication. You can provide authorization headers to send to the MCP server using the "Provider Data" abstraction provided by Llama Stack. When making an agent call,
+Note that most of the more useful MCP servers need you to authenticate with them. Many of them use OAuth2.0 for authentication. You can provide the authorization token when creating the Agent:
 
 ```python
 agent = Agent(
     ...,
-    tools=["mcp::deepwiki"],
-    extra_headers={
-        "X-LlamaStack-Provider-Data": json.dumps(
-            {
-                "mcp_headers": {
-                    "http://mcp.deepwiki.com/sse": {
-                        "Authorization": "Bearer <your_access_token>",
-                    },
-                },
-            }
-        ),
-    },
+    tools=[
+        {
+            "type": "mcp",
+            "server_url": "https://mcp.deepwiki.com/sse",
+            "server_label": "mcp::deepwiki",
+            "authorization": "<your_access_token>",  # OAuth token (without "Bearer " prefix)
+        }
+    ],
 )
+agent.create_turn(...)
 ```
@@ -1,7 +1,8 @@
 ---
-description: "Agents
+description: |
+  Agents
 
-  APIs for creating and interacting with agentic systems."
+  APIs for creating and interacting with agentic systems.
 sidebar_label: Agents
 title: Agents
 ---
@@ -14,7 +14,7 @@ Meta's reference implementation of an agent system that can use tools, access ve
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `persistence` | `<class 'inline.agents.meta_reference.config.AgentPersistenceConfig'>` | No | | |
+| `persistence` | `AgentPersistenceConfig` | No | | |
 
 ## Sample Configuration
 
@@ -1,5 +1,6 @@
 ---
-description: "The Batches API enables efficient processing of multiple requests in a single operation,
+description: |
+  The Batches API enables efficient processing of multiple requests in a single operation,
   particularly useful for processing large datasets, batch evaluation workflows, and
   cost-effective inference at scale.
 
@@ -8,7 +9,7 @@ description: "The Batches API enables efficient processing of multiple requests
   This API provides the following extensions:
   - idempotent batch creation
 
-  Note: This API is currently under active development and may undergo changes."
+  Note: This API is currently under active development and may undergo changes.
 sidebar_label: Batches
 title: Batches
 ---
@@ -14,9 +14,9 @@ Reference implementation of batches API with KVStore persistence.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Configuration for the key-value store backend. |
-| `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
-| `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. |
+| `kvstore` | `KVStoreReference` | No | | Configuration for the key-value store backend. |
+| `max_concurrent_batches` | `int` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
+| `max_concurrent_requests_per_batch` | `int` | No | 10 | Maximum number of concurrent requests to process per batch. |
 
 ## Sample Configuration
 
@@ -14,7 +14,7 @@ Local filesystem-based dataset I/O provider for reading and writing datasets to
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
+| `kvstore` | `KVStoreReference` | No | | |
 
 ## Sample Configuration
 
@@ -14,7 +14,7 @@ HuggingFace datasets provider for accessing and managing datasets from the Huggi
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
+| `kvstore` | `KVStoreReference` | No | | |
 
 ## Sample Configuration
 
@@ -17,7 +17,7 @@ NVIDIA's dataset I/O provider for accessing datasets from NVIDIA's data platform
 | `api_key` | `str \| None` | No | | The NVIDIA API key. |
 | `dataset_namespace` | `str \| None` | No | default | The NVIDIA dataset namespace. |
 | `project_id` | `str \| None` | No | test-project | The NVIDIA project ID. |
-| `datasets_url` | `<class 'str'>` | No | http://nemo.test | Base URL for the NeMo Dataset API |
+| `datasets_url` | `str` | No | http://nemo.test | Base URL for the NeMo Dataset API |
 
 ## Sample Configuration
 
@@ -1,7 +1,8 @@
 ---
-description: "Evaluations
+description: |
+  Evaluations
 
-  Llama Stack Evaluation API for running evaluations on model and agent candidates."
+  Llama Stack Evaluation API for running evaluations on model and agent candidates.
 sidebar_label: Eval
 title: Eval
 ---
@@ -14,7 +14,7 @@ Meta's reference implementation of evaluation tasks with support for multiple la
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
+| `kvstore` | `KVStoreReference` | No | | |
 
 ## Sample Configuration
 
@@ -14,7 +14,7 @@ NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `evaluator_url` | `<class 'str'>` | No | http://0.0.0.0:7331 | The url for accessing the evaluator service |
+| `evaluator_url` | `str` | No | http://0.0.0.0:7331 | The url for accessing the evaluator service |
 
 ## Sample Configuration
 
@@ -1,7 +1,8 @@
 ---
-description: "Files
+description: |
+  Files
 
-  This API is used to upload documents that can be used with other Llama Stack APIs."
+  This API is used to upload documents that can be used with other Llama Stack APIs.
 sidebar_label: Files
 title: Files
 ---
@@ -14,9 +14,9 @@ Local filesystem-based file storage provider for managing files and documents lo
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `storage_dir` | `<class 'str'>` | No | | Directory to store uploaded files |
-| `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |
-| `ttl_secs` | `<class 'int'>` | No | 31536000 | |
+| `storage_dir` | `str` | No | | Directory to store uploaded files |
+| `metadata_store` | `SqlStoreReference` | No | | SQL store configuration for file metadata |
+| `ttl_secs` | `int` | No | 31536000 | |
 
 ## Sample Configuration
 
@@ -14,8 +14,8 @@ OpenAI Files API provider for managing files through OpenAI's native file storag
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `api_key` | `<class 'str'>` | No | | OpenAI API key for authentication |
-| `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |
+| `api_key` | `str` | No | | OpenAI API key for authentication |
+| `metadata_store` | `SqlStoreReference` | No | | SQL store configuration for file metadata |
 
 ## Sample Configuration
 
@@ -14,13 +14,13 @@ AWS S3-based file storage provider for scalable cloud file management with metad
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `bucket_name` | `<class 'str'>` | No | | S3 bucket name to store files |
-| `region` | `<class 'str'>` | No | us-east-1 | AWS region where the bucket is located |
+| `bucket_name` | `str` | No | | S3 bucket name to store files |
+| `region` | `str` | No | us-east-1 | AWS region where the bucket is located |
 | `aws_access_key_id` | `str \| None` | No | | AWS access key ID (optional if using IAM roles) |
 | `aws_secret_access_key` | `str \| None` | No | | AWS secret access key (optional if using IAM roles) |
 | `endpoint_url` | `str \| None` | No | | Custom S3 endpoint URL (for MinIO, LocalStack, etc.) |
-| `auto_create_bucket` | `<class 'bool'>` | No | False | Automatically create the S3 bucket if it doesn't exist |
-| `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |
+| `auto_create_bucket` | `bool` | No | False | Automatically create the S3 bucket if it doesn't exist |
+| `metadata_store` | `SqlStoreReference` | No | | SQL store configuration for file metadata |
 
 ## Sample Configuration
 
@@ -1,12 +1,13 @@
 ---
-description: "Inference
+description: |
+  Inference
 
   Llama Stack Inference API for generating completions, chat completions, and embeddings.
 
   This API provides the raw interface to the underlying models. Three kinds of models are supported:
-  - LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
+  - LLM models: these models generate "raw" and "chat" (conversational) completions.
   - Embedding models: these models generate embeddings to be used for semantic search.
-  - Rerank models: these models reorder the documents based on their relevance to a query."
+  - Rerank models: these models reorder the documents based on their relevance to a query.
 sidebar_label: Inference
 title: Inference
 ---
@@ -16,12 +16,12 @@ Meta's reference implementation of inference with support for various model form
 |-------|------|----------|---------|-------------|
 | `model` | `str \| None` | No | | |
 | `torch_seed` | `int \| None` | No | | |
-| `max_seq_len` | `<class 'int'>` | No | 4096 | |
-| `max_batch_size` | `<class 'int'>` | No | 1 | |
+| `max_seq_len` | `int` | No | 4096 | |
+| `max_batch_size` | `int` | No | 1 | |
 | `model_parallel_size` | `int \| None` | No | | |
-| `create_distributed_process_group` | `<class 'bool'>` | No | True | |
+| `create_distributed_process_group` | `bool` | No | True | |
 | `checkpoint_dir` | `str \| None` | No | | |
-| `quantization` | `Bf16QuantizationConfig \| Fp8QuantizationConfig \| Int4QuantizationConfig, annotation=NoneType, required=True, discriminator='type'` | No | | |
+| `quantization` | `Bf16QuantizationConfig \| Fp8QuantizationConfig \| Int4QuantizationConfig \| None` | No | | |
 
 ## Sample Configuration
 
@@ -14,9 +14,9 @@ Anthropic inference provider for accessing Claude models and Anthropic's AI serv
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
 
 ## Sample Configuration
 
@@ -21,10 +21,10 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
-| `api_base` | `<class 'pydantic.networks.HttpUrl'>` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
+| `base_url` | `HttpUrl \| None` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com/openai/v1) |
 | `api_version` | `str \| None` | No | | Azure API version for Azure (e.g., 2024-12-01-preview) |
 | `api_type` | `str \| None` | No | azure | Azure API type for Azure (e.g., azure) |
 
@@ -32,7 +32,7 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
 
 ```yaml
 api_key: ${env.AZURE_API_KEY:=}
-api_base: ${env.AZURE_API_BASE:=}
+base_url: ${env.AZURE_API_BASE:=}
 api_version: ${env.AZURE_API_VERSION:=}
 api_type: ${env.AZURE_API_TYPE:=}
 ```
@@ -14,14 +14,14 @@ AWS Bedrock inference provider using OpenAI compatible endpoint.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
-| `region_name` | `<class 'str'>` | No | us-east-2 | AWS Region for the Bedrock Runtime endpoint |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
+| `region_name` | `str` | No | us-east-2 | AWS Region for the Bedrock Runtime endpoint |
 
 ## Sample Configuration
 
 ```yaml
-api_key: ${env.AWS_BEDROCK_API_KEY:=}
+api_key: ${env.AWS_BEARER_TOKEN_BEDROCK:=}
 region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
 ```
@@ -14,14 +14,14 @@ Cerebras inference provider for running models on Cerebras Cloud platform.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
-| `base_url` | `<class 'str'>` | No | https://api.cerebras.ai | Base URL for the Cerebras API |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
+| `base_url` | `HttpUrl \| None` | No | https://api.cerebras.ai/v1 | Base URL for the Cerebras API |
 
 ## Sample Configuration
 
 ```yaml
-base_url: https://api.cerebras.ai
+base_url: https://api.cerebras.ai/v1
 api_key: ${env.CEREBRAS_API_KEY:=}
 ```
@@ -14,14 +14,14 @@ Databricks inference provider for running models on Databricks' unified analytic
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_token` | `pydantic.types.SecretStr \| None` | No | | The Databricks API token |
-| `url` | `str \| None` | No | | The URL for the Databricks model serving endpoint |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_token` | `SecretStr \| None` | No | | The Databricks API token |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the Databricks model serving endpoint (should include /serving-endpoints path) |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.DATABRICKS_HOST:=}
+base_url: ${env.DATABRICKS_HOST:=}
 api_token: ${env.DATABRICKS_TOKEN:=}
 ```
@@ -14,14 +14,14 @@ Fireworks AI inference provider for Llama models and other AI models on the Fire
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `<class 'str'>` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
+| `base_url` | `HttpUrl \| None` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |
 
 ## Sample Configuration
 
 ```yaml
-url: https://api.fireworks.ai/inference/v1
+base_url: https://api.fireworks.ai/inference/v1
 api_key: ${env.FIREWORKS_API_KEY:=}
 ```
@@ -14,9 +14,9 @@ Google Gemini inference provider for accessing Gemini models and Google's AI ser
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
 
 ## Sample Configuration
 
@@ -14,14 +14,14 @@ Groq inference provider for ultra-fast inference using Groq's LPU technology.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `<class 'str'>` | No | https://api.groq.com | The URL for the Groq AI server |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
+| `base_url` | `HttpUrl \| None` | No | https://api.groq.com/openai/v1 | The URL for the Groq AI server |
 
 ## Sample Configuration
 
 ```yaml
-url: https://api.groq.com
+base_url: https://api.groq.com/openai/v1
 api_key: ${env.GROQ_API_KEY:=}
 ```
@@ -14,8 +14,8 @@ HuggingFace Inference Endpoints provider for dedicated model serving.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `endpoint_name` | `<class 'str'>` | No | | The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. |
-| `api_token` | `pydantic.types.SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |
+| `endpoint_name` | `str` | No | | The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. |
+| `api_token` | `SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |
 
 ## Sample Configuration
 
@@ -14,8 +14,8 @@ HuggingFace Inference API serverless provider for on-demand model inference.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `huggingface_repo` | `<class 'str'>` | No | | The model ID of the model on the Hugging Face Hub (e.g. 'meta-llama/Meta-Llama-3.1-70B-Instruct') |
-| `api_token` | `pydantic.types.SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |
+| `huggingface_repo` | `str` | No | | The model ID of the model on the Hugging Face Hub (e.g. 'meta-llama/Meta-Llama-3.1-70B-Instruct') |
+| `api_token` | `SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |
 
 ## Sample Configuration
 
@@ -14,14 +14,14 @@ Llama OpenAI-compatible provider for using Llama models with OpenAI API format.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
-| `openai_compat_api_base` | `<class 'str'>` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
+| `base_url` | `HttpUrl \| None` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |
 
 ## Sample Configuration
 
 ```yaml
-openai_compat_api_base: https://api.llama.com/compat/v1/
+base_url: https://api.llama.com/compat/v1/
 api_key: ${env.LLAMA_API_KEY}
 ```
@@ -14,18 +14,16 @@ NVIDIA inference provider for accessing NVIDIA NIM models and AI services.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `<class 'str'>` | No | https://integrate.api.nvidia.com | A base url for accessing the NVIDIA NIM |
-| `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |
-| `append_api_version` | `<class 'bool'>` | No | True | When set to false, the API version will not be appended to the base_url. By default, it is true. |
-| `rerank_model_to_url` | `dict[str, str` | No | `{'nv-rerank-qa-mistral-4b:1': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking', 'nvidia/nv-rerankqa-mistral-4b-v3': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/nv-rerankqa-mistral-4b-v3/reranking', 'nvidia/llama-3.2-nv-rerankqa-1b-v2': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking'}` | Mapping of rerank model identifiers to their API endpoints. |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
+| `base_url` | `HttpUrl \| None` | No | https://integrate.api.nvidia.com/v1 | A base url for accessing the NVIDIA NIM |
+| `timeout` | `int` | No | 60 | Timeout for the HTTP requests |
+| `rerank_model_to_url` | `dict[str, str]` | No | `{'nv-rerank-qa-mistral-4b:1': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking', 'nvidia/nv-rerankqa-mistral-4b-v3': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/nv-rerankqa-mistral-4b-v3/reranking', 'nvidia/llama-3.2-nv-rerankqa-1b-v2': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking'}` | Mapping of rerank model identifiers to their API endpoints. |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
 api_key: ${env.NVIDIA_API_KEY:=}
-append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
 ```
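`rerank_model_to_url` maps rerank model identifiers to their endpoints; the defaults above cover NVIDIA's hosted rerank models. A sketch of pointing one of those identifiers at a self-hosted NIM instead (the URL is hypothetical, and it is an assumption that entries in the provider config replace the defaults):

```yaml
rerank_model_to_url:
  "nv-rerank-qa-mistral-4b:1": https://nim.internal.example/v1/retrieval/reranking
```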
@@ -21,14 +21,14 @@ https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
-| `oci_auth_type` | `<class 'str'>` | No | instance_principal | OCI authentication type (must be one of: instance_principal, config_file) |
-| `oci_region` | `<class 'str'>` | No | us-ashburn-1 | OCI region (e.g., us-ashburn-1) |
-| `oci_compartment_id` | `<class 'str'>` | No | | OCI compartment ID for the Generative AI service |
-| `oci_config_file_path` | `<class 'str'>` | No | ~/.oci/config | OCI config file path (required if oci_auth_type is config_file) |
-| `oci_config_profile` | `<class 'str'>` | No | DEFAULT | OCI config profile (required if oci_auth_type is config_file) |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
+| `oci_auth_type` | `str` | No | instance_principal | OCI authentication type (must be one of: instance_principal, config_file) |
+| `oci_region` | `str` | No | us-ashburn-1 | OCI region (e.g., us-ashburn-1) |
+| `oci_compartment_id` | `str` | No | | OCI compartment ID for the Generative AI service |
+| `oci_config_file_path` | `str` | No | ~/.oci/config | OCI config file path (required if oci_auth_type is config_file) |
+| `oci_config_profile` | `str` | No | DEFAULT | OCI config profile (required if oci_auth_type is config_file) |
 
 ## Sample Configuration
 
@@ -14,12 +14,12 @@ Ollama inference provider for running local models through the Ollama runtime.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `url` | `<class 'str'>` | No | http://localhost:11434 | |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `base_url` | `HttpUrl \| None` | No | http://localhost:11434/v1 | |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.OLLAMA_URL:=http://localhost:11434}
+base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
 ```
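Note the pattern across these inference hunks: the old `url` fields held a bare host, while the new `base_url` fields carry the full OpenAI-compatible path (`/v1` for Ollama, `/openai/v1` for Groq, `/serving-endpoints` for Databricks). A sketch for a remote Ollama host (hostname hypothetical):

```yaml
# The /v1 suffix is now part of the configured URL.
base_url: ${env.OLLAMA_URL:=http://ollama.internal.example:11434/v1}
```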
@@ -14,10 +14,10 @@ OpenAI inference provider for accessing GPT models and other OpenAI services.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
-| `base_url` | `<class 'str'>` | No | https://api.openai.com/v1 | Base URL for OpenAI API |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
+| `base_url` | `HttpUrl \| None` | No | https://api.openai.com/v1 | Base URL for OpenAI API |
 
 ## Sample Configuration
 
@@ -14,14 +14,14 @@ Passthrough inference provider for connecting to any external inference service
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `<class 'str'>` | No | | The URL for the passthrough endpoint |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the passthrough endpoint |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.PASSTHROUGH_URL}
+base_url: ${env.PASSTHROUGH_URL}
 api_key: ${env.PASSTHROUGH_API_KEY}
 ```
@@ -14,14 +14,14 @@ RunPod inference provider for running models on RunPod's cloud GPU platform.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_token` | `pydantic.types.SecretStr \| None` | No | | The API token |
-| `url` | `str \| None` | No | | The URL for the Runpod model serving endpoint |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_token` | `SecretStr \| None` | No | | The API token |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the Runpod model serving endpoint |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.RUNPOD_URL:=}
+base_url: ${env.RUNPOD_URL:=}
 api_token: ${env.RUNPOD_API_TOKEN}
 ```
@@ -14,14 +14,14 @@ SambaNova inference provider for running models on SambaNova's dataflow architec
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `<class 'str'>` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
+| `base_url` | `HttpUrl \| None` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
 
 ## Sample Configuration
 
 ```yaml
-url: https://api.sambanova.ai/v1
+base_url: https://api.sambanova.ai/v1
 api_key: ${env.SAMBANOVA_API_KEY:=}
 ```
@@ -14,12 +14,12 @@ Text Generation Inference (TGI) provider for HuggingFace model serving.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `url` | `<class 'str'>` | No | | The URL for the TGI serving endpoint |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the TGI serving endpoint (should include /v1 path) |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.TGI_URL:=}
+base_url: ${env.TGI_URL:=}
 ```
@@ -14,14 +14,14 @@ Together AI inference provider for open-source models and collaborative AI devel
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `<class 'str'>` | No | https://api.together.xyz/v1 | The URL for the Together AI server |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
+| `base_url` | `HttpUrl \| None` | No | https://api.together.xyz/v1 | The URL for the Together AI server |
 
 ## Sample Configuration
 
 ```yaml
-url: https://api.together.xyz/v1
+base_url: https://api.together.xyz/v1
 api_key: ${env.TOGETHER_API_KEY:=}
 ```
@@ -53,10 +53,10 @@ Available Models:
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `project` | `<class 'str'>` | No | | Google Cloud project ID for Vertex AI |
-| `location` | `<class 'str'>` | No | us-central1 | Google Cloud location for Vertex AI |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `project` | `str` | No | | Google Cloud project ID for Vertex AI |
+| `location` | `str` | No | us-central1 | Google Cloud location for Vertex AI |
 
 ## Sample Configuration
 
@@ -14,17 +14,17 @@ Remote vLLM inference provider for connecting to vLLM servers.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_token` | `pydantic.types.SecretStr \| None` | No | | The API token |
-| `url` | `str \| None` | No | | The URL for the vLLM model serving endpoint |
-| `max_tokens` | `<class 'int'>` | No | 4096 | Maximum number of tokens to generate. |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_token` | `SecretStr \| None` | No | | The API token |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the vLLM model serving endpoint |
+| `max_tokens` | `int` | No | 4096 | Maximum number of tokens to generate. |
 | `tls_verify` | `bool \| str` | No | True | Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file. |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.VLLM_URL:=}
+base_url: ${env.VLLM_URL:=}
 max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
 api_token: ${env.VLLM_API_TOKEN:=fake}
 tls_verify: ${env.VLLM_TLS_VERIFY:=true}
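`tls_verify` accepts either a boolean or a path to a CA certificate file. A sketch that verifies the vLLM server against a private CA (the file path is hypothetical):

```yaml
base_url: ${env.VLLM_URL:=}
api_token: ${env.VLLM_API_TOKEN:=fake}
# A string value is treated as a CA bundle path rather than a boolean.
tls_verify: /etc/ssl/certs/internal-ca.pem
```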
@@ -14,17 +14,17 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `<class 'str'>` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
+| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
+| `base_url` | `HttpUrl \| None` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
 | `project_id` | `str \| None` | No | | The watsonx.ai project ID |
-| `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |
+| `timeout` | `int` | No | 60 | Timeout for the HTTP requests |
 
 ## Sample Configuration
 
 ```yaml
-url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
+base_url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
 api_key: ${env.WATSONX_API_KEY:=}
 project_id: ${env.WATSONX_PROJECT_ID:=}
 ```
@@ -14,23 +14,23 @@ HuggingFace-based post-training provider for fine-tuning models using the Huggin
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `device` | `<class 'str'>` | No | cuda | |
-| `distributed_backend` | `Literal['fsdp', 'deepspeed'` | No | | |
-| `checkpoint_format` | `Literal['full_state', 'huggingface'` | No | huggingface | |
-| `chat_template` | `<class 'str'>` | No | `<|user|>`<br/>`{input}`<br/>`<|assistant|>`<br/>`{output}` | |
-| `model_specific_config` | `<class 'dict'>` | No | `{'trust_remote_code': True, 'attn_implementation': 'sdpa'}` | |
-| `max_seq_length` | `<class 'int'>` | No | 2048 | |
-| `gradient_checkpointing` | `<class 'bool'>` | No | False | |
-| `save_total_limit` | `<class 'int'>` | No | 3 | |
-| `logging_steps` | `<class 'int'>` | No | 10 | |
-| `warmup_ratio` | `<class 'float'>` | No | 0.1 | |
-| `weight_decay` | `<class 'float'>` | No | 0.01 | |
-| `dataloader_num_workers` | `<class 'int'>` | No | 4 | |
-| `dataloader_pin_memory` | `<class 'bool'>` | No | True | |
-| `dpo_beta` | `<class 'float'>` | No | 0.1 | |
-| `use_reference_model` | `<class 'bool'>` | No | True | |
-| `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair'` | No | sigmoid | |
-| `dpo_output_dir` | `<class 'str'>` | No | | |
+| `device` | `str` | No | cuda | |
+| `distributed_backend` | `Literal[fsdp, deepspeed] \| None` | No | | |
+| `checkpoint_format` | `Literal[full_state, huggingface] \| None` | No | huggingface | |
+| `chat_template` | `str` | No | `<|user|>`<br/>`{input}`<br/>`<|assistant|>`<br/>`{output}` | |
+| `model_specific_config` | `dict` | No | `{'trust_remote_code': True, 'attn_implementation': 'sdpa'}` | |
+| `max_seq_length` | `int` | No | 2048 | |
+| `gradient_checkpointing` | `bool` | No | False | |
+| `save_total_limit` | `int` | No | 3 | |
+| `logging_steps` | `int` | No | 10 | |
+| `warmup_ratio` | `float` | No | 0.1 | |
+| `weight_decay` | `float` | No | 0.01 | |
+| `dataloader_num_workers` | `int` | No | 4 | |
+| `dataloader_pin_memory` | `bool` | No | True | |
+| `dpo_beta` | `float` | No | 0.1 | |
+| `use_reference_model` | `bool` | No | True | |
+| `dpo_loss_type` | `Literal[sigmoid, hinge, ipo, kto_pair]` | No | sigmoid | |
+| `dpo_output_dir` | `str` | No | | |
 
 ## Sample Configuration
 
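The diff does not show this provider's sample configuration, but a minimal DPO-oriented sketch can be assembled from the fields and defaults in the table above (values are illustrative, and the output path is hypothetical):

```yaml
device: cuda
checkpoint_format: huggingface
max_seq_length: 2048
dpo_beta: 0.1
dpo_loss_type: sigmoid
dpo_output_dir: /tmp/llama-stack/dpo-output
```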
@@ -15,7 +15,7 @@ TorchTune-based post-training provider for fine-tuning and optimizing models usi
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `torch_seed` | `int \| None` | No | | |
-| `checkpoint_format` | `Literal['meta', 'huggingface'` | No | meta | |
+| `checkpoint_format` | `Literal[meta, huggingface] \| None` | No | meta | |
 
 ## Sample Configuration
 
@@ -15,7 +15,7 @@ TorchTune-based post-training provider for fine-tuning and optimizing models usi
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `torch_seed` | `int \| None` | No | | |
-| `checkpoint_format` | `Literal['meta', 'huggingface'` | No | meta | |
+| `checkpoint_format` | `Literal[meta, huggingface] \| None` | No | meta | |
 
 ## Sample Configuration
 
@@ -18,9 +18,9 @@ NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform.
 | `dataset_namespace` | `str \| None` | No | default | The NVIDIA dataset namespace. |
 | `project_id` | `str \| None` | No | test-example-model@v1 | The NVIDIA project ID. |
 | `customizer_url` | `str \| None` | No | | Base URL for the NeMo Customizer API |
-| `timeout` | `<class 'int'>` | No | 300 | Timeout for the NVIDIA Post Training API |
-| `max_retries` | `<class 'int'>` | No | 3 | Maximum number of retries for the NVIDIA Post Training API |
-| `output_model_dir` | `<class 'str'>` | No | test-example-model@v1 | Directory to save the output model |
+| `timeout` | `int` | No | 300 | Timeout for the NVIDIA Post Training API |
+| `max_retries` | `int` | No | 3 | Maximum number of retries for the NVIDIA Post Training API |
+| `output_model_dir` | `str` | No | test-example-model@v1 | Directory to save the output model |
 
 ## Sample Configuration
 
@@ -1,7 +1,8 @@
 ---
-description: "Safety
+description: |
+  Safety
 
-  OpenAI-compatible Moderations API."
+  OpenAI-compatible Moderations API.
 sidebar_label: Safety
 title: Safety
 ---
@@ -14,7 +14,7 @@ Llama Guard safety provider for content moderation and safety filtering using Me
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `excluded_categories` | `list[str` | No | [] | |
+| `excluded_categories` | `list[str]` | No | [] | |
 
 ## Sample Configuration
 
@@ -14,7 +14,7 @@ Prompt Guard safety provider for detecting and filtering unsafe prompts and cont
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `guard_type` | `<class 'str'>` | No | injection | |
+| `guard_type` | `str` | No | injection | |
 
 ## Sample Configuration
 
@@ -14,8 +14,8 @@ AWS Bedrock safety provider for content moderation using AWS's safety services.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
+| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `aws_access_key_id` | `str \| None` | No | | The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID |
 | `aws_secret_access_key` | `str \| None` | No | | The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY |
 | `aws_session_token` | `str \| None` | No | | The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN |
@@ -14,7 +14,7 @@ NVIDIA's safety provider for content moderation and safety filtering.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `guardrails_service_url` | `<class 'str'>` | No | http://0.0.0.0:7331 | The url for accessing the Guardrails service |
+| `guardrails_service_url` | `str` | No | http://0.0.0.0:7331 | The url for accessing the Guardrails service |
 | `config_id` | `str \| None` | No | self-check | Guardrails configuration ID to use from the Guardrails configuration store |
 
 ## Sample Configuration
@@ -14,8 +14,8 @@ SambaNova's safety provider for content moderation and safety filtering.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `url` | `<class 'str'>` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | The SambaNova cloud API Key |
+| `url` | `str` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
+| `api_key` | `SecretStr \| None` | No | | The SambaNova cloud API Key |
 
 ## Sample Configuration
 
@@ -15,7 +15,7 @@ Bing Search tool for web search capabilities using Microsoft's search engine.
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `api_key` | `str \| None` | No | | |
-| `top_k` | `<class 'int'>` | No | 3 | |
+| `top_k` | `int` | No | 3 | |
 
 ## Sample Configuration
 
@@ -15,7 +15,7 @@ Brave Search tool for web search capabilities with privacy-focused results.
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `api_key` | `str \| None` | No | | The Brave Search API Key |
-| `max_results` | `<class 'int'>` | No | 3 | The maximum number of results to return |
+| `max_results` | `int` | No | 3 | The maximum number of results to return |
 
 ## Sample Configuration
 
@@ -15,7 +15,7 @@ Tavily Search tool for AI-optimized web search with structured results.
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `api_key` | `str \| None` | No | | The Tavily Search API Key |
-| `max_results` | `<class 'int'>` | No | 3 | The maximum number of results to return |
+| `max_results` | `int` | No | 3 | The maximum number of results to return |
 
 ## Sample Configuration
 
@@ -78,8 +78,8 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `db_path` | `<class 'str'>` | No | | |
-| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
+| `db_path` | `str` | No | | |
+| `persistence` | `KVStoreReference` | No | | Config for KV store backend |
 
 ## Sample Configuration
 
@@ -95,7 +95,7 @@ more details about Faiss in general.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
+| `persistence` | `KVStoreReference` | No | | |
 
 ## Sample Configuration
 
@@ -14,7 +14,7 @@ Meta's reference implementation of a vector database.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
+| `persistence` | `KVStoreReference` | No | | |
 
 ## Sample Configuration
 
@@ -16,9 +16,9 @@ Please refer to the remote provider documentation.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `db_path` | `<class 'str'>` | No | | |
-| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
-| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
+| `db_path` | `str` | No | | |
+| `persistence` | `KVStoreReference` | No | | Config for KV store backend (SQLite only for now) |
+| `consistency_level` | `str` | No | Strong | The consistency level of the Milvus server |
 
 ## Sample Configuration
 
@@ -97,8 +97,8 @@ See the [Qdrant documentation](https://qdrant.tech/documentation/) for more deta
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `path` | `<class 'str'>` | No | | |
-| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
+| `path` | `str` | No | | |
+| `persistence` | `KVStoreReference` | No | | |
 
 ## Sample Configuration
 
@@ -407,8 +407,8 @@ See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) f
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
-| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
+| `db_path` | `str` | No | | Path to the SQLite database file |
+| `persistence` | `KVStoreReference` | No | | Config for KV store backend (SQLite only for now) |
 
 ## Sample Configuration
 
@@ -16,8 +16,8 @@ Please refer to the sqlite-vec provider documentation.
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
-| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
+| `db_path` | `str` | No | | Path to the SQLite database file |
+| `persistence` | `KVStoreReference` | No | | Config for KV store backend (SQLite only for now) |
 
 ## Sample Configuration
 
@@ -78,7 +78,7 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `url` | `str \| None` | No | | |
-| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
+| `persistence` | `KVStoreReference` | No | | Config for KV store backend |
 
 ## Sample Configuration
 
@@ -405,10 +405,10 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
 
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `uri` | `<class 'str'>` | No | | The URI of the Milvus server |
+| `uri` | `str` | No | | The URI of the Milvus server |
 | `token` | `str \| None` | No | | The token of the Milvus server |
-| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
-| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
+| `consistency_level` | `str` | No | Strong | The consistency level of the Milvus server |
+| `persistence` | `KVStoreReference` | No | | Config for KV store backend |
 | `config` | `dict` | No | `{}` | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |
 
 :::note
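The `config` field above is an open dict forwarded to the underlying Milvus client. A sketch of using it for TLS (the keys are pymilvus connection options and should be checked against your client version; the host and file path are hypothetical):

```yaml
uri: https://milvus.internal.example:19530
config:
  secure: true
  server_pem_path: /etc/ssl/certs/milvus-ca.pem
```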
@@ -218,7 +218,7 @@ See [PGVector's documentation](https://github.com/pgvector/pgvector) for more de
 | `db` | `str \| None` | No | postgres | |
 | `user` | `str \| None` | No | postgres | |
 | `password` | `str \| None` | No | mysecretpassword | |
-| `persistence` | `llama_stack.core.storage.datatypes.KVStoreReference \| None` | No | | Config for KV store backend (SQLite only for now) |
+| `persistence` | `KVStoreReference \| None` | No | | Config for KV store backend (SQLite only for now) |
 
 ## Sample Configuration
 
@@ -19,14 +19,14 @@ Please refer to the inline provider documentation.
 | `location` | `str \| None` | No | | |
 | `url` | `str \| None` | No | | |
 | `port` | `int \| None` | No | 6333 | |
-| `grpc_port` | `<class 'int'>` | No | 6334 | |
-| `prefer_grpc` | `<class 'bool'>` | No | False | |
+| `grpc_port` | `int` | No | 6334 | |
+| `prefer_grpc` | `bool` | No | False | |
 | `https` | `bool \| None` | No | | |
 | `api_key` | `str \| None` | No | | |
 | `prefix` | `str \| None` | No | | |
 | `timeout` | `int \| None` | No | | |
 | `host` | `str \| None` | No | | |
-| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
+| `persistence` | `KVStoreReference` | No | | |
 
 ## Sample Configuration
 
@@ -75,7 +75,7 @@ See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more
 |-------|------|----------|---------|-------------|
 | `weaviate_api_key` | `str \| None` | No | | The API key for the Weaviate instance |
 | `weaviate_cluster_url` | `str \| None` | No | localhost:8080 | The URL of the Weaviate cluster |
-| `persistence` | `llama_stack.core.storage.datatypes.KVStoreReference \| None` | No | | Config for KV store backend (SQLite only for now) |
+| `persistence` | `KVStoreReference \| None` | No | | Config for KV store backend (SQLite only for now) |
 
 ## Sample Configuration
 
docs/package-lock.json (generated, 122 changes)
@@ -10712,12 +10712,6 @@
       "integrity": "sha512-QMUezzXWII9EV5aTFXW1UBVUO77wYPpjqIF8/AviUCThNeSYZykpoTixUeaNNBwmCev0AMDWMAni+f8Hxb1IFw==",
       "license": "Unlicense"
     },
-    "node_modules/fs.realpath": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz",
-      "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==",
-      "license": "ISC"
-    },
     "node_modules/fsevents": {
       "version": "2.3.3",
       "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
@@ -10821,21 +10815,20 @@
       "license": "ISC"
     },
     "node_modules/glob": {
-      "version": "7.2.3",
-      "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz",
-      "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==",
-      "deprecated": "Glob versions prior to v9 are no longer supported",
+      "version": "10.5.0",
+      "resolved": "https://registry.npmjs.org/glob/-/glob-10.5.0.tgz",
+      "integrity": "sha512-DfXN8DfhJ7NH3Oe7cFmu3NCu1wKbkReJ8TorzSAFbSKrlNaQSKfIzqYqVY8zlbs2NLBbWpRiU52GX2PbaBVNkg==",
       "license": "ISC",
       "dependencies": {
-        "fs.realpath": "^1.0.0",
-        "inflight": "^1.0.4",
-        "inherits": "2",
-        "minimatch": "^3.1.1",
-        "once": "^1.3.0",
-        "path-is-absolute": "^1.0.0"
+        "foreground-child": "^3.1.0",
+        "jackspeak": "^3.1.2",
+        "minimatch": "^9.0.4",
+        "minipass": "^7.1.2",
+        "package-json-from-dist": "^1.0.0",
+        "path-scurry": "^1.11.1"
       },
-      "engines": {
-        "node": "*"
+      "bin": {
+        "glob": "dist/esm/bin.mjs"
       },
       "funding": {
         "url": "https://github.com/sponsors/isaacs"
@@ -10859,26 +10852,19 @@
       "integrity": "sha512-lkX1HJXwyMcprw/5YUZc2s7DrpAiHB21/V+E1rHUrVNokkvB6bqMzT0VfV6/86ZNabt1k14YOIaT7nDvOX3Iiw==",
       "license": "BSD-2-Clause"
     },
-    "node_modules/glob/node_modules/brace-expansion": {
-      "version": "1.1.12",
-      "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz",
-      "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==",
-      "license": "MIT",
-      "dependencies": {
-        "balanced-match": "^1.0.0",
-        "concat-map": "0.0.1"
-      }
-    },
     "node_modules/glob/node_modules/minimatch": {
-      "version": "3.1.2",
-      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
-      "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==",
+      "version": "9.0.5",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz",
+      "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==",
       "license": "ISC",
       "dependencies": {
-        "brace-expansion": "^1.1.7"
+        "brace-expansion": "^2.0.1"
       },
       "engines": {
-        "node": "*"
+        "node": ">=16 || 14 >=14.17"
       },
       "funding": {
         "url": "https://github.com/sponsors/isaacs"
       }
     },
     "node_modules/global-dirs": {
@@ -11792,17 +11778,6 @@
         "node": ">=12"
       }
     },
-    "node_modules/inflight": {
-      "version": "1.0.6",
-      "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz",
-      "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==",
-      "deprecated": "This module is not supported, and leaks memory. Do not use it. Check out lru-cache if you want a good and tested way to coalesce async requests by a key value, which is much more comprehensive and powerful.",
-      "license": "ISC",
-      "dependencies": {
-        "once": "^1.3.0",
-        "wrappy": "1"
-      }
-    },
     "node_modules/inherits": {
      "version": "2.0.4",
      "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
@@ -15570,15 +15545,6 @@
         "node": ">= 0.8"
       }
     },
-    "node_modules/once": {
-      "version": "1.4.0",
-      "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
-      "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
-      "license": "ISC",
-      "dependencies": {
-        "wrappy": "1"
-      }
-    },
     "node_modules/onetime": {
       "version": "5.1.2",
       "resolved": "https://registry.npmjs.org/onetime/-/onetime-5.1.2.tgz",
@@ -15955,15 +15921,6 @@
         "node": "^12.20.0 || ^14.13.1 || >=16.0.0"
       }
     },
-    "node_modules/path-is-absolute": {
-      "version": "1.0.1",
-      "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz",
-      "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==",
-      "license": "MIT",
-      "engines": {
-        "node": ">=0.10.0"
-      }
-    },
     "node_modules/path-is-inside": {
       "version": "1.0.2",
       "resolved": "https://registry.npmjs.org/path-is-inside/-/path-is-inside-1.0.2.tgz",
@@ -20038,41 +19995,6 @@
         "node": ">= 6"
       }
     },
-    "node_modules/sucrase/node_modules/glob": {
-      "version": "10.4.5",
-      "resolved": "https://registry.npmjs.org/glob/-/glob-10.4.5.tgz",
-      "integrity": "sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==",
-      "license": "ISC",
-      "dependencies": {
-        "foreground-child": "^3.1.0",
-        "jackspeak": "^3.1.2",
-        "minimatch": "^9.0.4",
-        "minipass": "^7.1.2",
-        "package-json-from-dist": "^1.0.0",
-        "path-scurry": "^1.11.1"
-      },
-      "bin": {
-        "glob": "dist/esm/bin.mjs"
-      },
-      "funding": {
-        "url": "https://github.com/sponsors/isaacs"
-      }
-    },
-    "node_modules/sucrase/node_modules/minimatch": {
-      "version": "9.0.5",
-      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz",
-      "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==",
-      "license": "ISC",
-      "dependencies": {
-        "brace-expansion": "^2.0.1"
-      },
-      "engines": {
-        "node": ">=16 || 14 >=14.17"
-      },
-      "funding": {
-        "url": "https://github.com/sponsors/isaacs"
-      }
-    },
     "node_modules/supports-color": {
       "version": "7.2.0",
       "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
@@ -21620,12 +21542,6 @@
         "url": "https://github.com/chalk/strip-ansi?sponsor=1"
       }
     },
-    "node_modules/wrappy": {
-      "version": "1.0.2",
-      "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
-      "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
-      "license": "ISC"
-    },
     "node_modules/write-file-atomic": {
       "version": "3.0.3",
       "resolved": "https://registry.npmjs.org/write-file-atomic/-/write-file-atomic-3.0.3.tgz",
@@ -31,6 +31,9 @@
     "react-dom": "^19.0.0",
     "remark-code-import": "^1.2.0"
   },
+  "overrides": {
+    "glob": "^10.5.0"
+  },
   "browserslist": {
     "production": [
       ">0.5%",
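This `overrides` entry (in the docs site's package.json, to judge from the surrounding dependencies and the lockfile path) explains the lockfile churn above: npm's `overrides` field forces every transitive dependent to resolve `glob` at `^10.5.0`, which retires the deprecated glob@7 dependency chain (`fs.realpath`, `inflight`, `once`, `path-is-absolute`, `wrappy`) removed in the preceding hunks.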
docs/static/deprecated-llama-stack-spec.yaml (vendored, 166 changes)
@@ -193,7 +193,7 @@ paths:
         content:
           application/json:
             schema:
-              $ref: '#/components/schemas/RegisterScoringFunctionRequestLoose'
+              $ref: '#/components/schemas/RegisterScoringFunctionRequest'
         required: true
       deprecated: true
   /v1/scoring-functions/{scoring_fn_id}:
@@ -549,7 +549,7 @@ paths:
         content:
           application/json:
             schema:
-              $ref: '#/components/schemas/RegisterDatasetRequestLoose'
+              $ref: '#/components/schemas/RegisterDatasetRequest'
         required: true
       deprecated: true
   /v1beta/datasets/{dataset_id}:
@@ -3572,9 +3572,10 @@ components:
         type: array
         title: Output
       parallel_tool_calls:
-        type: boolean
-        title: Parallel Tool Calls
-        default: false
+        anyOf:
+          - type: boolean
+          - type: 'null'
+        default: true
       previous_response_id:
         anyOf:
           - type: string
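This hunk family, repeated across the generated spec files below, changes `parallel_tool_calls` from a non-nullable boolean defaulting to `false` to a nullable boolean defaulting to `true`, which matches the OpenAI Responses API behavior of allowing parallel tool calls unless explicitly disabled.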
@@ -3974,6 +3975,11 @@ components:
         anyOf:
           - type: string
           - type: 'null'
+      parallel_tool_calls:
+        anyOf:
+          - type: boolean
+          - type: 'null'
+        default: true
       previous_response_id:
         anyOf:
           - type: string
@@ -4100,9 +4106,10 @@ components:
         type: array
         title: Output
       parallel_tool_calls:
-        type: boolean
-        title: Parallel Tool Calls
-        default: false
+        anyOf:
+          - type: boolean
+          - type: 'null'
+        default: true
       previous_response_id:
         anyOf:
           - type: string
@@ -6704,9 +6711,21 @@ components:
           title: Object
           default: vector_store.file
         attributes:
-          additionalProperties: true
+          additionalProperties:
+            anyOf:
+              - type: string
+                maxLength: 512
+              - type: number
+              - type: boolean
+            title: string | number | boolean
+          propertyNames:
+            type: string
+            maxLength: 64
           type: object
+          maxProperties: 16
           title: Attributes
+          description: Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard. Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters, booleans, or numbers.
+          x-oaiTypeLabel: map
         chunking_strategy:
           oneOf:
             - $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
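A sketch of an `attributes` map that satisfies the new constraints (at most 16 keys, key names up to 64 characters, values limited to strings up to 512 characters, numbers, or booleans); the keys and values are illustrative:

```yaml
attributes:
  project: llama-stack-docs
  priority: 2
  reviewed: true
```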
@@ -7435,6 +7454,14 @@ components:
         - scores
       title: EvaluateResponse
       description: The response from an evaluation.
+    RunEvalRequest:
+      properties:
+        benchmark_config:
+          $ref: '#/components/schemas/BenchmarkConfig'
+      type: object
+      required:
+        - benchmark_config
+      title: RunEvalRequest
     Job:
       properties:
         job_id:
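With `RunEvalRequest` in place, the eval-run endpoint takes a named wrapper object rather than a bare `BenchmarkConfig`. A minimal request body sketch (the fields of `BenchmarkConfig` itself are defined elsewhere in the spec and omitted here):

```yaml
benchmark_config:
  # ...BenchmarkConfig fields...
```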
@@ -8018,6 +8045,67 @@ components:
           - $ref: '#/components/schemas/CompletionInputType'
             title: CompletionInputType
       title: StringType | ... (9 variants)
+    RegisterScoringFunctionRequest:
+      properties:
+        scoring_fn_id:
+          type: string
+          title: Scoring Fn Id
+        description:
+          type: string
+          title: Description
+        return_type:
+          anyOf:
+            - $ref: '#/components/schemas/StringType'
+              title: StringType
+            - $ref: '#/components/schemas/NumberType'
+              title: NumberType
+            - $ref: '#/components/schemas/BooleanType'
+              title: BooleanType
+            - $ref: '#/components/schemas/ArrayType'
+              title: ArrayType
+            - $ref: '#/components/schemas/ObjectType'
+              title: ObjectType
+            - $ref: '#/components/schemas/JsonType'
+              title: JsonType
+            - $ref: '#/components/schemas/UnionType'
+              title: UnionType
+            - $ref: '#/components/schemas/ChatCompletionInputType'
+              title: ChatCompletionInputType
+            - $ref: '#/components/schemas/CompletionInputType'
+              title: CompletionInputType
+          title: StringType | ... (9 variants)
+        provider_scoring_fn_id:
+          anyOf:
+            - type: string
+            - type: 'null'
+        provider_id:
+          anyOf:
+            - type: string
+            - type: 'null'
+        params:
+          anyOf:
+            - oneOf:
+                - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
+                  title: LLMAsJudgeScoringFnParams
+                - $ref: '#/components/schemas/RegexParserScoringFnParams'
+                  title: RegexParserScoringFnParams
+                - $ref: '#/components/schemas/BasicScoringFnParams'
+                  title: BasicScoringFnParams
+              discriminator:
+                propertyName: type
+                mapping:
+                  basic: '#/components/schemas/BasicScoringFnParams'
+                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
+                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
+              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
+            - type: 'null'
+          title: Params
+      type: object
+      required:
+        - scoring_fn_id
+        - description
+        - return_type
+      title: RegisterScoringFunctionRequest
     RegisterShieldRequest:
       properties:
         shield_id:
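A minimal body against the schema above; the ID and description are illustrative, and it is an assumption that `StringType` serializes as a bare `type: string` object:

```yaml
scoring_fn_id: my-org/answer-relevance
description: Scores answers for relevance on a 0-1 scale.
return_type:
  type: string
```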
@@ -8076,6 +8164,31 @@ components:
           - $ref: '#/components/schemas/RowsDataSource'
             title: RowsDataSource
       title: URIDataSource | RowsDataSource
+    RegisterDatasetRequest:
+      properties:
+        purpose:
+          $ref: '#/components/schemas/DatasetPurpose'
+        source:
+          anyOf:
+            - $ref: '#/components/schemas/URIDataSource'
+              title: URIDataSource
+            - $ref: '#/components/schemas/RowsDataSource'
+              title: RowsDataSource
+          title: URIDataSource | RowsDataSource
+        metadata:
+          anyOf:
+            - additionalProperties: true
+              type: object
+            - type: 'null'
+        dataset_id:
+          anyOf:
+            - type: string
+            - type: 'null'
+      type: object
+      required:
+        - purpose
+        - source
+      title: RegisterDatasetRequest
     RegisterBenchmarkRequest:
       properties:
         benchmark_id:
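A minimal body against `RegisterDatasetRequest`; the purpose value, source shape, and URI are illustrative (this diff shows only that `source` is a `URIDataSource | RowsDataSource` union):

```yaml
purpose: eval/question-answer
source:
  type: uri
  uri: https://example.com/datasets/qa.jsonl
dataset_id: my-qa-dataset
```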
@@ -8812,41 +8925,6 @@ components:
       required:
         - reasoning_tokens
       title: OutputTokensDetails
-    RegisterDatasetRequestLoose:
-      properties:
-        purpose:
-          title: Purpose
-        source:
-          title: Source
-        metadata:
-          title: Metadata
-        dataset_id:
-          title: Dataset Id
-      type: object
-      required:
-        - purpose
-        - source
-      title: RegisterDatasetRequestLoose
-    RegisterScoringFunctionRequestLoose:
-      properties:
-        scoring_fn_id:
-          title: Scoring Fn Id
-        description:
-          title: Description
-        return_type:
-          title: Return Type
-        provider_scoring_fn_id:
-          title: Provider Scoring Fn Id
-        provider_id:
-          title: Provider Id
-        params:
-          title: Params
-      type: object
-      required:
-        - scoring_fn_id
-        - description
-        - return_type
-      title: RegisterScoringFunctionRequestLoose
     SearchRankingOptions:
       properties:
         ranker:
docs/static/experimental-llama-stack-spec.yaml (vendored, 38 changes)
@@ -300,7 +300,7 @@ paths:
         content:
           application/json:
             schema:
-              $ref: '#/components/schemas/BenchmarkConfig'
+              $ref: '#/components/schemas/RunEvalRequest'
         required: true
   /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
     get:
@@ -3297,9 +3297,10 @@ components:
         type: array
         title: Output
       parallel_tool_calls:
-        type: boolean
-        title: Parallel Tool Calls
-        default: false
+        anyOf:
+          - type: boolean
+          - type: 'null'
+        default: true
       previous_response_id:
         anyOf:
           - type: string
@@ -3696,9 +3697,10 @@ components:
         type: array
         title: Output
       parallel_tool_calls:
-        type: boolean
-        title: Parallel Tool Calls
-        default: false
+        anyOf:
+          - type: boolean
+          - type: 'null'
+        default: true
       previous_response_id:
         anyOf:
           - type: string
@@ -6093,9 +6095,21 @@ components:
           title: Object
           default: vector_store.file
         attributes:
-          additionalProperties: true
+          additionalProperties:
+            anyOf:
+              - type: string
+                maxLength: 512
+              - type: number
+              - type: boolean
+            title: string | number | boolean
+          propertyNames:
+            type: string
+            maxLength: 64
           type: object
+          maxProperties: 16
           title: Attributes
+          description: Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard. Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters, booleans, or numbers.
+          x-oaiTypeLabel: map
         chunking_strategy:
           oneOf:
             - $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
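The new `attributes` schema matches the OpenAI vector-store metadata contract: at most 16 keys, each key a string of up to 64 characters, each value a string (max 512 characters), number, or boolean. A small standalone checker, as a sketch of what the constraints mean — the function is illustrative, not part of the spec tooling:

    def validate_attributes(attrs: dict) -> None:
        # maxProperties: 16
        if len(attrs) > 16:
            raise ValueError("at most 16 attributes allowed")
        for key, value in attrs.items():
            # propertyNames: type string, maxLength 64
            if not isinstance(key, str) or len(key) > 64:
                raise ValueError(f"bad key: {key!r}")
            # additionalProperties: string (maxLength 512) | number | boolean
            if isinstance(value, str):
                if len(value) > 512:
                    raise ValueError(f"value too long for {key!r}")
            elif not isinstance(value, (int, float, bool)):
                raise ValueError(f"unsupported value type for {key!r}")

    validate_attributes({"source": "crawl-2025", "priority": 3, "reviewed": True})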
@@ -6745,6 +6759,14 @@ components:
         - scores
       title: EvaluateResponse
       description: The response from an evaluation.
+    RunEvalRequest:
+      properties:
+        benchmark_config:
+          $ref: '#/components/schemas/BenchmarkConfig'
+      type: object
+      required:
+        - benchmark_config
+      title: RunEvalRequest
     Job:
       properties:
         job_id:
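With `RunEvalRequest` in place (and the path change above switching the run-eval endpoint to it), the benchmark config travels wrapped under a `benchmark_config` key instead of being the bare body. A hedged sketch of the request-body difference, with field contents elided as placeholders:

    # before: the BenchmarkConfig object was the entire body
    old_body = {"eval_candidate": ..., "scoring_params": ...}
    # after: the same object is embedded under "benchmark_config"
    new_body = {"benchmark_config": {"eval_candidate": ..., "scoring_params": ...}}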
33 docs/static/llama-stack-spec.yaml vendored
@@ -5760,9 +5760,10 @@ components:
           type: array
           title: Output
         parallel_tool_calls:
-          type: boolean
-          title: Parallel Tool Calls
-          default: false
+          anyOf:
+            - type: boolean
+            - type: 'null'
+          default: true
         previous_response_id:
           anyOf:
             - type: string
@@ -6162,6 +6163,11 @@ components:
           anyOf:
             - type: string
             - type: 'null'
+        parallel_tool_calls:
+          anyOf:
+            - type: boolean
+            - type: 'null'
+          default: true
         previous_response_id:
           anyOf:
             - type: string
@@ -6288,9 +6294,10 @@ components:
           type: array
           title: Output
         parallel_tool_calls:
-          type: boolean
-          title: Parallel Tool Calls
-          default: false
+          anyOf:
+            - type: boolean
+            - type: 'null'
+          default: true
         previous_response_id:
           anyOf:
             - type: string
@@ -8892,9 +8899,21 @@ components:
           title: Object
           default: vector_store.file
         attributes:
-          additionalProperties: true
+          additionalProperties:
+            anyOf:
+              - type: string
+                maxLength: 512
+              - type: number
+              - type: boolean
+            title: string | number | boolean
+          propertyNames:
+            type: string
+            maxLength: 64
           type: object
+          maxProperties: 16
           title: Attributes
+          description: Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard. Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters, booleans, or numbers.
+          x-oaiTypeLabel: map
         chunking_strategy:
           oneOf:
             - $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
168 docs/static/stainless-llama-stack-spec.yaml vendored
@@ -1820,7 +1820,7 @@ paths:
         content:
           application/json:
             schema:
-              $ref: '#/components/schemas/RegisterScoringFunctionRequestLoose'
+              $ref: '#/components/schemas/RegisterScoringFunctionRequest'
         required: true
       deprecated: true
   /v1/scoring-functions/{scoring_fn_id}:
@@ -3310,7 +3310,7 @@ paths:
         content:
           application/json:
             schema:
-              $ref: '#/components/schemas/RegisterDatasetRequestLoose'
+              $ref: '#/components/schemas/RegisterDatasetRequest'
         required: true
       deprecated: true
   /v1beta/datasets/{dataset_id}:
@@ -3567,7 +3567,7 @@ paths:
         content:
           application/json:
             schema:
-              $ref: '#/components/schemas/BenchmarkConfig'
+              $ref: '#/components/schemas/RunEvalRequest'
         required: true
   /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
     get:
@@ -6739,9 +6739,10 @@ components:
           type: array
           title: Output
         parallel_tool_calls:
-          type: boolean
-          title: Parallel Tool Calls
-          default: false
+          anyOf:
+            - type: boolean
+            - type: 'null'
+          default: true
         previous_response_id:
           anyOf:
             - type: string
@@ -7141,6 +7142,11 @@ components:
           anyOf:
             - type: string
            - type: 'null'
+        parallel_tool_calls:
+          anyOf:
+            - type: boolean
+            - type: 'null'
+          default: true
         previous_response_id:
           anyOf:
             - type: string
@@ -7267,9 +7273,10 @@ components:
           type: array
           title: Output
         parallel_tool_calls:
-          type: boolean
-          title: Parallel Tool Calls
-          default: false
+          anyOf:
+            - type: boolean
+            - type: 'null'
+          default: true
         previous_response_id:
           anyOf:
             - type: string
@@ -9871,9 +9878,21 @@ components:
           title: Object
           default: vector_store.file
         attributes:
-          additionalProperties: true
+          additionalProperties:
+            anyOf:
+              - type: string
+                maxLength: 512
+              - type: number
+              - type: boolean
+            title: string | number | boolean
+          propertyNames:
+            type: string
+            maxLength: 64
           type: object
+          maxProperties: 16
           title: Attributes
+          description: Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard. Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters, booleans, or numbers.
+          x-oaiTypeLabel: map
         chunking_strategy:
           oneOf:
             - $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
@@ -10602,6 +10621,14 @@ components:
         - scores
       title: EvaluateResponse
       description: The response from an evaluation.
+    RunEvalRequest:
+      properties:
+        benchmark_config:
+          $ref: '#/components/schemas/BenchmarkConfig'
+      type: object
+      required:
+        - benchmark_config
+      title: RunEvalRequest
     Job:
       properties:
         job_id:
@@ -11185,6 +11212,67 @@ components:
           - $ref: '#/components/schemas/CompletionInputType'
             title: CompletionInputType
           title: StringType | ... (9 variants)
+    RegisterScoringFunctionRequest:
+      properties:
+        scoring_fn_id:
+          type: string
+          title: Scoring Fn Id
+        description:
+          type: string
+          title: Description
+        return_type:
+          anyOf:
+            - $ref: '#/components/schemas/StringType'
+              title: StringType
+            - $ref: '#/components/schemas/NumberType'
+              title: NumberType
+            - $ref: '#/components/schemas/BooleanType'
+              title: BooleanType
+            - $ref: '#/components/schemas/ArrayType'
+              title: ArrayType
+            - $ref: '#/components/schemas/ObjectType'
+              title: ObjectType
+            - $ref: '#/components/schemas/JsonType'
+              title: JsonType
+            - $ref: '#/components/schemas/UnionType'
+              title: UnionType
+            - $ref: '#/components/schemas/ChatCompletionInputType'
+              title: ChatCompletionInputType
+            - $ref: '#/components/schemas/CompletionInputType'
+              title: CompletionInputType
+          title: StringType | ... (9 variants)
+        provider_scoring_fn_id:
+          anyOf:
+            - type: string
+            - type: 'null'
+        provider_id:
+          anyOf:
+            - type: string
+            - type: 'null'
+        params:
+          anyOf:
+            - oneOf:
+                - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
+                  title: LLMAsJudgeScoringFnParams
+                - $ref: '#/components/schemas/RegexParserScoringFnParams'
+                  title: RegexParserScoringFnParams
+                - $ref: '#/components/schemas/BasicScoringFnParams'
+                  title: BasicScoringFnParams
+              discriminator:
+                propertyName: type
+                mapping:
+                  basic: '#/components/schemas/BasicScoringFnParams'
+                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
+                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
+              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
+            - type: 'null'
+          title: Params
+      type: object
+      required:
+        - scoring_fn_id
+        - description
+        - return_type
+      title: RegisterScoringFunctionRequest
     RegisterShieldRequest:
       properties:
         shield_id:
@@ -11243,6 +11331,31 @@ components:
           - $ref: '#/components/schemas/RowsDataSource'
             title: RowsDataSource
           title: URIDataSource | RowsDataSource
+    RegisterDatasetRequest:
+      properties:
+        purpose:
+          $ref: '#/components/schemas/DatasetPurpose'
+        source:
+          anyOf:
+            - $ref: '#/components/schemas/URIDataSource'
+              title: URIDataSource
+            - $ref: '#/components/schemas/RowsDataSource'
+              title: RowsDataSource
+          title: URIDataSource | RowsDataSource
+        metadata:
+          anyOf:
+            - additionalProperties: true
+              type: object
+            - type: 'null'
+        dataset_id:
+          anyOf:
+            - type: string
+            - type: 'null'
+      type: object
+      required:
+        - purpose
+        - source
+      title: RegisterDatasetRequest
     RegisterBenchmarkRequest:
       properties:
         benchmark_id:
@@ -11979,41 +12092,6 @@ components:
       required:
         - reasoning_tokens
       title: OutputTokensDetails
-    RegisterDatasetRequestLoose:
-      properties:
-        purpose:
-          title: Purpose
-        source:
-          title: Source
-        metadata:
-          title: Metadata
-        dataset_id:
-          title: Dataset Id
-      type: object
-      required:
-        - purpose
-        - source
-      title: RegisterDatasetRequestLoose
-    RegisterScoringFunctionRequestLoose:
-      properties:
-        scoring_fn_id:
-          title: Scoring Fn Id
-        description:
-          title: Description
-        return_type:
-          title: Return Type
-        provider_scoring_fn_id:
-          title: Provider Scoring Fn Id
-        provider_id:
-          title: Provider Id
-        params:
-          title: Params
-      type: object
-      required:
-        - scoring_fn_id
-        - description
-        - return_type
-      title: RegisterScoringFunctionRequestLoose
     SearchRankingOptions:
       properties:
         ranker:
@@ -38,7 +38,6 @@ dependencies = [
    "pyjwt[crypto]>=2.10.0",  # Pull crypto to support RS256 for jwt. Requires 2.10.0+ for ssl_context support.
    "pydantic>=2.11.9",
    "rich",
    "starlette",
    "termcolor",
    "tiktoken",
    "pillow",
@@ -50,7 +49,6 @@ dependencies = [
    "aiosqlite>=0.21.0",  # server - for metadata store
    "asyncpg",  # for metadata store
    "sqlalchemy[asyncio]>=2.0.41",  # server - for conversations
    "pyyaml>=6.0.2",
    "starlette>=0.49.1",
]
@@ -358,6 +356,10 @@ exclude = [
 module = [
     "yaml",
     "fire",
+    "redis.asyncio",
+    "psycopg2",
+    "psycopg2.extras",
+    "psycopg2.extensions",
     "torchtune.*",
     "fairscale.*",
     "torchvision.*",
@@ -287,9 +287,9 @@ start_container() {
    # On macOS/Windows, use host.docker.internal to reach host from container
    # On Linux with --network host, use localhost
    if [[ "$(uname)" == "Darwin" ]] || [[ "$(uname)" == *"MINGW"* ]]; then
-        OLLAMA_URL="${OLLAMA_URL:-http://host.docker.internal:11434}"
+        OLLAMA_URL="${OLLAMA_URL:-http://host.docker.internal:11434/v1}"
    else
-        OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
+        OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434/v1}"
    fi
    DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OLLAMA_URL=$OLLAMA_URL"
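Appending /v1 points clients at Ollama's OpenAI-compatible endpoint rather than its native API. A minimal sketch of what the resulting URL is used for, assuming the standard openai client package and an illustrative model name:

    from openai import OpenAI

    # Ollama serves an OpenAI-compatible API under /v1; any non-empty key works locally.
    client = OpenAI(base_url="http://localhost:11434/v1", api_key="unused")
    resp = client.chat.completions.create(
        model="llama3.2:3b",
        messages=[{"role": "user", "content": "hello"}],
    )
    print(resp.choices[0].message.content)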
@@ -16,16 +16,16 @@ import sys
 from tests.integration.suites import SETUP_DEFINITIONS, SUITE_DEFINITIONS
 
 
-def get_setup_env_vars(setup_name, suite_name=None):
+def get_setup_config(setup_name, suite_name=None):
     """
-    Get environment variables for a setup, with optional suite default fallback.
+    Get full configuration (env vars + defaults) for a setup.
 
     Args:
         setup_name: Name of the setup (e.g., 'ollama', 'gpt')
         suite_name: Optional suite name to get default setup if setup_name is None
 
     Returns:
-        Dictionary of environment variables
+        Dictionary with 'env' and 'defaults' keys
     """
     # If no setup specified, try to get default from suite
     if not setup_name and suite_name:
@@ -34,7 +34,7 @@ def get_setup_env_vars(setup_name, suite_name=None):
         setup_name = suite.default_setup
 
     if not setup_name:
-        return {}
+        return {"env": {}, "defaults": {}}
 
     setup = SETUP_DEFINITIONS.get(setup_name)
     if not setup:
@@ -44,27 +44,31 @@ def get_setup_env_vars(setup_name, suite_name=None):
         )
         sys.exit(1)
 
-    return setup.env
+    return {"env": setup.env, "defaults": setup.defaults}
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Extract environment variables from a test setup")
+    parser = argparse.ArgumentParser(description="Extract environment variables and defaults from a test setup")
     parser.add_argument("--setup", help="Setup name (e.g., ollama, gpt)")
     parser.add_argument("--suite", help="Suite name to get default setup from if --setup not provided")
     parser.add_argument("--format", choices=["bash", "json"], default="bash", help="Output format (default: bash)")
 
     args = parser.parse_args()
 
-    env_vars = get_setup_env_vars(args.setup, args.suite)
+    config = get_setup_config(args.setup, args.suite)
 
     if args.format == "bash":
-        # Output as bash export statements
-        for key, value in env_vars.items():
+        # Output env vars as bash export statements
+        for key, value in config["env"].items():
             print(f"export {key}='{value}'")
+        # Output defaults as bash export statements with LLAMA_STACK_TEST_ prefix
+        for key, value in config["defaults"].items():
+            env_key = f"LLAMA_STACK_TEST_{key.upper()}"
+            print(f"export {env_key}='{value}'")
    elif args.format == "json":
        import json
 
-        print(json.dumps(env_vars))
+        print(json.dumps(config))
 
 
 if __name__ == "__main__":
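The script now emits both the setup's env vars and its defaults, the latter as exports prefixed with LLAMA_STACK_TEST_. A sketch of consuming the JSON form; the script path and setup name here are illustrative, not confirmed by the diff:

    import json
    import subprocess

    out = subprocess.run(
        ["python", "scripts/get_setup_env.py", "--setup", "ollama", "--format", "json"],
        capture_output=True, text=True, check=True,
    ).stdout
    config = json.loads(out)  # {"env": {...}, "defaults": {...}}
    for key, value in config["defaults"].items():
        print(f"LLAMA_STACK_TEST_{key.upper()}={value}")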
@@ -640,7 +640,7 @@ cmd=( run -d "${PLATFORM_OPTS[@]}" --name llama-stack \
    --network llama-net \
    -p "${PORT}:${PORT}" \
    "${server_env_opts[@]}" \
-    -e OLLAMA_URL="http://ollama-server:${OLLAMA_PORT}" \
+    -e OLLAMA_URL="http://ollama-server:${OLLAMA_PORT}/v1" \
    "${SERVER_IMAGE}" --port "${PORT}")

log "🦙 Starting Llama Stack..."
@@ -20,6 +20,7 @@ TEST_PATTERN=""
 INFERENCE_MODE="replay"
 EXTRA_PARAMS=""
 COLLECT_ONLY=false
+TYPESCRIPT_ONLY=false
 
 # Function to display usage
 usage() {
@@ -34,6 +35,7 @@ Options:
   --subdirs STRING          Comma-separated list of test subdirectories to run (overrides suite)
   --pattern STRING          Regex pattern to pass to pytest -k
   --collect-only            Collect tests only without running them (skips server startup)
+  --typescript-only         Skip Python tests and run only TypeScript client tests
   --help                    Show this help message
 
 Suites are defined in tests/integration/suites.py and define which tests to run.
@@ -90,6 +92,10 @@ while [[ $# -gt 0 ]]; do
            COLLECT_ONLY=true
            shift
            ;;
+        --typescript-only)
+            TYPESCRIPT_ONLY=true
+            shift
+            ;;
        --help)
            usage
            exit 0
@@ -181,6 +187,10 @@ echo "$SETUP_ENV"
 eval "$SETUP_ENV"
 echo ""
 
+# Export suite and setup names for TypeScript tests
+export LLAMA_STACK_TEST_SUITE="$TEST_SUITE"
+export LLAMA_STACK_TEST_SETUP="$TEST_SETUP"
+
 ROOT_DIR="$THIS_DIR/.."
 cd $ROOT_DIR
 
@@ -212,6 +222,71 @@ find_available_port() {
    return 1
 }
 
+run_client_ts_tests() {
+    if ! command -v npm &>/dev/null; then
+        echo "npm could not be found; ensure Node.js is installed"
+        return 1
+    fi
+
+    pushd tests/integration/client-typescript >/dev/null
+
+    # Determine if TS_CLIENT_PATH is a directory path or an npm version
+    if [[ -d "$TS_CLIENT_PATH" ]]; then
+        # It's a directory path - use local checkout
+        if [[ ! -f "$TS_CLIENT_PATH/package.json" ]]; then
+            echo "Error: $TS_CLIENT_PATH exists but doesn't look like llama-stack-client-typescript (no package.json)"
+            popd >/dev/null
+            return 1
+        fi
+        echo "Using local llama-stack-client-typescript from: $TS_CLIENT_PATH"
+
+        # Build the TypeScript client first
+        echo "Building TypeScript client..."
+        pushd "$TS_CLIENT_PATH" >/dev/null
+        npm install --silent
+        npm run build --silent
+        popd >/dev/null
+
+        # Install other dependencies first
+        if [[ "${CI:-}" == "true" || "${CI:-}" == "1" ]]; then
+            npm ci --silent
+        else
+            npm install --silent
+        fi
+
+        # Then install the client from local directory
+        echo "Installing llama-stack-client from: $TS_CLIENT_PATH"
+        npm install "$TS_CLIENT_PATH" --silent
+    else
+        # It's an npm version specifier - install from npm
+        echo "Installing llama-stack-client@${TS_CLIENT_PATH} from npm"
+        if [[ "${CI:-}" == "true" || "${CI:-}" == "1" ]]; then
+            npm ci --silent
+            npm install "llama-stack-client@${TS_CLIENT_PATH}" --silent
+        else
+            npm install "llama-stack-client@${TS_CLIENT_PATH}" --silent
+        fi
+    fi
+
+    # Verify installation
+    echo "Verifying llama-stack-client installation..."
+    if npm list llama-stack-client 2>/dev/null | grep -q llama-stack-client; then
+        echo "✅ llama-stack-client successfully installed"
+        npm list llama-stack-client
+    else
+        echo "❌ llama-stack-client not found in node_modules"
+        echo "Installed packages:"
+        npm list --depth=0
+        popd >/dev/null
+        return 1
+    fi
+
+    echo "Running TypeScript tests for suite $TEST_SUITE (setup $TEST_SETUP)"
+    npm test
+
+    popd >/dev/null
+}
+
 # Start Llama Stack Server if needed
 if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
    # Find an available port for the server
@@ -221,6 +296,7 @@ if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
        exit 1
    fi
    export LLAMA_STACK_PORT
+    export TEST_API_BASE_URL="http://localhost:$LLAMA_STACK_PORT"
    echo "Will use port: $LLAMA_STACK_PORT"

    stop_server() {
@@ -298,6 +374,7 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
        exit 1
    fi
    export LLAMA_STACK_PORT
+    export TEST_API_BASE_URL="http://localhost:$LLAMA_STACK_PORT"
    echo "Will use port: $LLAMA_STACK_PORT"

    echo "=== Building Docker Image for distribution: $DISTRO ==="
@@ -473,6 +550,8 @@ if [[ -n "$STACK_CONFIG" ]]; then
    STACK_CONFIG_ARG="--stack-config=$STACK_CONFIG"
 fi
 
+# Run Python tests unless typescript-only mode
+if [[ "$TYPESCRIPT_ONLY" == "false" ]]; then
 pytest -s -v $PYTEST_TARGET \
    $STACK_CONFIG_ARG \
    --inference-mode="$INFERENCE_MODE" \
@@ -483,6 +562,11 @@ pytest -s -v $PYTEST_TARGET \
    --color=yes $EXTRA_PARAMS \
    --capture=tee-sys
 exit_code=$?
+else
+    echo "Skipping Python tests (--typescript-only mode)"
+    exit_code=0
+fi
 
 set +x
 set -e
 
@@ -506,5 +590,10 @@ else
    exit 1
 fi
 
+# Run TypeScript client tests if TS_CLIENT_PATH is set
+if [[ $exit_code -eq 0 && -n "${TS_CLIENT_PATH:-}" && "${LLAMA_STACK_TEST_STACK_CONFIG_TYPE:-}" == "server" ]]; then
+    run_client_ts_tests
+fi
+
 echo ""
 echo "=== Integration Tests Complete ==="
@@ -11,6 +11,13 @@ This module provides functionality to generate OpenAPI specifications
 from FastAPI applications.
 """
 
-from .main import generate_openapi_spec, main
-
 __all__ = ["generate_openapi_spec", "main"]
+
+
+def __getattr__(name: str):
+    if name in {"generate_openapi_spec", "main"}:
+        from .main import generate_openapi_spec as _gos
+        from .main import main as _main
+
+        return {"generate_openapi_spec": _gos, "main": _main}[name]
+    raise AttributeError(name)
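Replacing the eager import with a module-level __getattr__ (PEP 562) defers importing .main until an attribute is first accessed, keeping package import cheap. A self-contained sketch of the same pattern, with illustrative module and attribute names:

    # lazy_pkg/__init__.py
    __all__ = ["heavy_function"]

    def __getattr__(name: str):
        if name == "heavy_function":
            from .heavy_module import heavy_function as _fn  # imported on first access
            return _fn
        raise AttributeError(name)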
@@ -15,6 +15,7 @@ import typing
 from typing import Annotated, Any, get_args, get_origin
 
 from fastapi import FastAPI
+from fastapi.params import Body as FastAPIBody
 from pydantic import Field, create_model
 
 from llama_stack.log import get_logger
@@ -26,6 +27,8 @@ from .state import _extra_body_fields, register_dynamic_model
 
 logger = get_logger(name=__name__, category="core")
 
+type QueryParameter = tuple[str, type, Any, bool]
+
 
 def _to_pascal_case(segment: str) -> str:
     tokens = re.findall(r"[A-Za-z]+|\d+", segment)
@@ -75,12 +78,12 @@ def _create_endpoint_with_request_model(
     return endpoint
 
 
-def _build_field_definitions(query_parameters: list[tuple[str, type, Any]], use_any: bool = False) -> dict[str, tuple]:
+def _build_field_definitions(query_parameters: list[QueryParameter], use_any: bool = False) -> dict[str, tuple]:
     """Build field definitions for a Pydantic model from query parameters."""
     from typing import Any
 
     field_definitions = {}
-    for param_name, param_type, default_value in query_parameters:
+    for param_name, param_type, default_value, _ in query_parameters:
         if use_any:
             field_definitions[param_name] = (Any, ... if default_value is inspect.Parameter.empty else default_value)
             continue
@@ -108,10 +111,10 @@ def _build_field_definitions(query_parameters: list[tuple[str, type, Any]], use_
         field_definitions[param_name] = (Any, ... if default_value is inspect.Parameter.empty else default_value)
 
     # Ensure all parameters are included
-    expected_params = {name for name, _, _ in query_parameters}
+    expected_params = {name for name, _, _, _ in query_parameters}
     missing = expected_params - set(field_definitions.keys())
     if missing:
-        for param_name, _, default_value in query_parameters:
+        for param_name, _, default_value, _ in query_parameters:
             if param_name in missing:
                 field_definitions[param_name] = (
                     Any,
@@ -126,7 +129,7 @@ def _create_dynamic_request_model(
     webmethod,
     method_name: str,
     http_method: str,
-    query_parameters: list[tuple[str, type, Any]],
+    query_parameters: list[QueryParameter],
     use_any: bool = False,
     variant_suffix: str | None = None,
 ) -> type | None:
@@ -143,12 +146,12 @@ def _create_dynamic_request_model(
 
 
 def _build_signature_params(
-    query_parameters: list[tuple[str, type, Any]],
+    query_parameters: list[QueryParameter],
 ) -> tuple[list[inspect.Parameter], dict[str, type]]:
     """Build signature parameters and annotations from query parameters."""
     signature_params = []
     param_annotations = {}
-    for param_name, param_type, default_value in query_parameters:
+    for param_name, param_type, default_value, _ in query_parameters:
         param_annotations[param_name] = param_type
         signature_params.append(
             inspect.Parameter(
@@ -219,6 +222,19 @@ def _is_extra_body_field(metadata_item: Any) -> bool:
     return isinstance(metadata_item, ExtraBodyField)
 
 
+def _should_embed_parameter(param_type: Any) -> bool:
+    """Determine whether a parameter should be embedded (wrapped) in the request body."""
+    if get_origin(param_type) is Annotated:
+        args = get_args(param_type)
+        metadata = args[1:] if len(args) > 1 else []
+        for metadata_item in metadata:
+            if isinstance(metadata_item, FastAPIBody):
+                # FastAPI treats embed=None as False, so default to False when unset.
+                return bool(metadata_item.embed)
+    # Unannotated parameters default to embed=True through create_dynamic_typed_route.
+    return True
+
+
 def _is_async_iterator_type(type_obj: Any) -> bool:
     """Check if a type is AsyncIterator or AsyncIterable."""
     from collections.abc import AsyncIterable, AsyncIterator
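_should_embed_parameter mirrors FastAPI's Body(embed=...) semantics: an Annotated parameter carrying Body(embed=True) gets wrapped under its parameter name in the request body, while embed=False (or unset) means the model is the body itself. A hedged sketch of the two shapes, with illustrative endpoint and model names:

    from typing import Annotated
    from fastapi import Body, FastAPI
    from pydantic import BaseModel

    class Item(BaseModel):
        name: str

    app = FastAPI()

    @app.post("/plain")
    async def plain(item: Item):
        # body is the model itself: {"name": "x"}
        return item

    @app.post("/embedded")
    async def embedded(item: Annotated[Item, Body(embed=True)]):
        # body wraps the model under the param name: {"item": {"name": "x"}}
        return item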
@@ -282,7 +298,7 @@ def _find_models_for_endpoint(
 
     Returns:
         tuple: (request_model, response_model, query_parameters, file_form_params, streaming_response_model, response_schema_name)
-        where query_parameters is a list of (name, type, default_value) tuples
+        where query_parameters is a list of (name, type, default_value, should_embed) tuples
         and file_form_params is a list of inspect.Parameter objects for File()/Form() params
         and streaming_response_model is the model for streaming responses (AsyncIterator content)
     """
@@ -299,7 +315,7 @@ def _find_models_for_endpoint(
 
     # Find request model and collect all body parameters
     request_model = None
-    query_parameters = []
+    query_parameters: list[QueryParameter] = []
     file_form_params = []
     path_params = set()
     extra_body_params = []
@@ -325,6 +341,7 @@ def _find_models_for_endpoint(
 
         # Check if it's a File() or Form() parameter - these need special handling
         param_type = param.annotation
+        param_should_embed = _should_embed_parameter(param_type)
        if _is_file_or_form_param(param_type):
            # File() and Form() parameters must be in the function signature directly
            # They cannot be part of a Pydantic model
@@ -350,30 +367,14 @@ def _find_models_for_endpoint(
                # Store as extra body parameter - exclude from request model
                extra_body_params.append((param_name, base_type, extra_body_description))
                continue
            param_type = base_type

            # Check if it's a Pydantic model (for POST/PUT requests)
            if hasattr(param_type, "model_json_schema"):
-                # Collect all body parameters including Pydantic models
-                # We'll decide later whether to use a single model or create a combined one
-                query_parameters.append((param_name, param_type, param.default))
-            elif get_origin(param_type) is Annotated:
-                # Handle Annotated types - get the base type
-                args = get_args(param_type)
-                if args and hasattr(args[0], "model_json_schema"):
-                    # Collect Pydantic models from Annotated types
-                    query_parameters.append((param_name, args[0], param.default))
+                query_parameters.append((param_name, param_type, param.default, param_should_embed))
            else:
-                # Regular annotated parameter (but not File/Form, already handled above)
-                query_parameters.append((param_name, param_type, param.default))
-            else:
-                # This is likely a body parameter for POST/PUT or query parameter for GET
-                # Store the parameter info for later use
                # Preserve inspect.Parameter.empty to distinguish "no default" from "default=None"
-                default_value = param.default
-
-                # Extract the base type from union types (e.g., str | None -> str)
-                # Also make it safe for FastAPI to avoid forward reference issues
-                query_parameters.append((param_name, param_type, default_value))
+                query_parameters.append((param_name, param_type, param.default, param_should_embed))
@@ -385,8 +386,8 @@ def _find_models_for_endpoint(
    # Otherwise, we'll create a combined request model from all parameters
    # BUT: For GET requests, never create a request body - all parameters should be query parameters
    if is_post_put and len(query_parameters) == 1:
-        param_name, param_type, default_value = query_parameters[0]
-        if hasattr(param_type, "model_json_schema"):
+        param_name, param_type, default_value, should_embed = query_parameters[0]
+        if hasattr(param_type, "model_json_schema") and not should_embed:
            request_model = param_type
            query_parameters = []  # Clear query_parameters so we use the single model

@@ -495,7 +496,7 @@ def _create_fastapi_endpoint(app: FastAPI, route, webmethod, api: Api):
    if file_form_params and is_post_put:
        signature_params = list(file_form_params)
        param_annotations = {param.name: param.annotation for param in file_form_params}
-        for param_name, param_type, default_value in query_parameters:
+        for param_name, param_type, default_value, _ in query_parameters:
            signature_params.append(
                inspect.Parameter(
                    param_name,
@@ -4,4 +4,4 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from .kvstore import *  # noqa: F401, F403
+# Package marker for Stainless config generation.
821 scripts/openapi_generator/stainless_config/generate_config.py Normal file
@@ -0,0 +1,821 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from __future__ import annotations

from collections.abc import Iterator
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import yaml

HEADER = "# yaml-language-server: $schema=https://app.stainlessapi.com/config-internal.schema.json\n\n"

SECTION_ORDER = [
    "organization",
    "security",
    "security_schemes",
    "targets",
    "client_settings",
    "environments",
    "pagination",
    "settings",
    "openapi",
    "readme",
    "resources",
]
ORGANIZATION = {
    "name": "llama-stack-client",
    "docs": "https://llama-stack.readthedocs.io/en/latest/",
    "contact": "llamastack@meta.com",
}

SECURITY = [{}, {"BearerAuth": []}]

SECURITY_SCHEMES = {"BearerAuth": {"type": "http", "scheme": "bearer"}}

TARGETS = {
    "node": {
        "package_name": "llama-stack-client",
        "production_repo": "llamastack/llama-stack-client-typescript",
        "publish": {"npm": False},
    },
    "python": {
        "package_name": "llama_stack_client",
        "production_repo": "llamastack/llama-stack-client-python",
        "options": {"use_uv": True},
        "publish": {"pypi": True},
        "project_name": "llama_stack_client",
    },
    "kotlin": {
        "reverse_domain": "com.llama_stack_client.api",
        "production_repo": None,
        "publish": {"maven": False},
    },
    "go": {
        "package_name": "llama-stack-client",
        "production_repo": "llamastack/llama-stack-client-go",
        "options": {"enable_v2": True, "back_compat_use_shared_package": False},
    },
}

CLIENT_SETTINGS = {
    "default_env_prefix": "LLAMA_STACK_CLIENT",
    "opts": {
        "api_key": {
            "type": "string",
            "read_env": "LLAMA_STACK_CLIENT_API_KEY",
            "auth": {"security_scheme": "BearerAuth"},
            "nullable": True,
        }
    },
}

ENVIRONMENTS = {"production": "http://any-hosted-llama-stack.com"}

PAGINATION = [
    {
        "name": "datasets_iterrows",
        "type": "offset",
        "request": {
            "dataset_id": {"type": "string"},
            "start_index": {
                "type": "integer",
                "x-stainless-pagination-property": {"purpose": "offset_count_param"},
            },
            "limit": {"type": "integer"},
        },
        "response": {
            "data": {"type": "array", "items": {"type": "object"}},
            "next_index": {
                "type": "integer",
                "x-stainless-pagination-property": {"purpose": "offset_count_start_field"},
            },
        },
    },
    {
        "name": "openai_cursor_page",
        "type": "cursor",
        "request": {
            "limit": {"type": "integer"},
            "after": {
                "type": "string",
                "x-stainless-pagination-property": {"purpose": "next_cursor_param"},
            },
        },
        "response": {
            "data": {"type": "array", "items": {}},
            "has_more": {"type": "boolean"},
            "last_id": {
                "type": "string",
                "x-stainless-pagination-property": {"purpose": "next_cursor_field"},
            },
        },
    },
]
SETTINGS = {
    "license": "MIT",
    "unwrap_response_fields": ["data"],
    "file_header": "Copyright (c) Meta Platforms, Inc. and affiliates.\n"
    "All rights reserved.\n"
    "\n"
    "This source code is licensed under the terms described in the "
    "LICENSE file in\n"
    "the root directory of this source tree.\n",
}

OPENAPI = {
    "transformations": [
        {
            "command": "mergeObject",
            "reason": "Better return_type using enum",
            "args": {
                "target": ["$.components.schemas"],
                "object": {
                    "ReturnType": {
                        "additionalProperties": False,
                        "properties": {
                            "type": {
                                "enum": [
                                    "string",
                                    "number",
                                    "boolean",
                                    "array",
                                    "object",
                                    "json",
                                    "union",
                                    "chat_completion_input",
                                    "completion_input",
                                    "agent_turn_input",
                                ]
                            }
                        },
                        "required": ["type"],
                        "type": "object",
                    }
                },
            },
        },
        {
            "command": "replaceProperties",
            "reason": "Replace return type properties with better model (see above)",
            "args": {
                "filter": {
                    "only": [
                        "$.components.schemas.ScoringFn.properties.return_type",
                        "$.components.schemas.RegisterScoringFunctionRequest.properties.return_type",
                    ]
                },
                "value": {"$ref": "#/components/schemas/ReturnType"},
            },
        },
        {
            "command": "oneOfToAnyOf",
            "reason": "Prism (mock server) doesn't like one of our "
            "requests as it technically matches multiple "
            "variants",
        },
    ]
}

README = {
    "example_requests": {
        "default": {
            "type": "request",
            "endpoint": "post /v1/chat/completions",
            "params": {},
        },
        "headline": {"type": "request", "endpoint": "get /v1/models", "params": {}},
        "pagination": {
            "type": "request",
            "endpoint": "post /v1/chat/completions",
            "params": {},
        },
    }
}
ALL_RESOURCES = {
    "$shared": {
        "models": {
            "interleaved_content_item": "InterleavedContentItem",
            "interleaved_content": "InterleavedContent",
            "param_type": "ParamType",
            "safety_violation": "SafetyViolation",
            "sampling_params": "SamplingParams",
            "scoring_result": "ScoringResult",
            "system_message": "SystemMessage",
        }
    },
    "toolgroups": {
        "models": {
            "tool_group": "ToolGroup",
            "list_tool_groups_response": "ListToolGroupsResponse",
        },
        "methods": {
            "register": "post /v1/toolgroups",
            "get": "get /v1/toolgroups/{toolgroup_id}",
            "list": "get /v1/toolgroups",
            "unregister": "delete /v1/toolgroups/{toolgroup_id}",
        },
    },
    "tools": {
        "methods": {
            "get": "get /v1/tools/{tool_name}",
            "list": {"paginated": False, "endpoint": "get /v1/tools"},
        }
    },
    "tool_runtime": {
        "models": {
            "tool_def": "ToolDef",
            "tool_invocation_result": "ToolInvocationResult",
        },
        "methods": {
            "list_tools": {
                "paginated": False,
                "endpoint": "get /v1/tool-runtime/list-tools",
            },
            "invoke_tool": "post /v1/tool-runtime/invoke",
        },
    },
    "responses": {
        "models": {
            "response_object_stream": "OpenAIResponseObjectStream",
            "response_object": "OpenAIResponseObject",
        },
        "methods": {
            "create": {
                "type": "http",
                "streaming": {
                    "stream_event_model": "responses.response_object_stream",
                    "param_discriminator": "stream",
                },
                "endpoint": "post /v1/responses",
            },
            "retrieve": "get /v1/responses/{response_id}",
            "list": {"type": "http", "endpoint": "get /v1/responses"},
            "delete": {
                "type": "http",
                "endpoint": "delete /v1/responses/{response_id}",
            },
        },
        "subresources": {
            "input_items": {
                "methods": {
                    "list": {
                        "type": "http",
                        "paginated": False,
                        "endpoint": "get /v1/responses/{response_id}/input_items",
                    }
                }
            }
        },
    },
    "prompts": {
        "models": {"prompt": "Prompt", "list_prompts_response": "ListPromptsResponse"},
        "methods": {
            "create": "post /v1/prompts",
            "list": {"paginated": False, "endpoint": "get /v1/prompts"},
            "retrieve": "get /v1/prompts/{prompt_id}",
            "update": "post /v1/prompts/{prompt_id}",
            "delete": "delete /v1/prompts/{prompt_id}",
            "set_default_version": "post /v1/prompts/{prompt_id}/set-default-version",
        },
        "subresources": {
            "versions": {
                "methods": {
                    "list": {
                        "paginated": False,
                        "endpoint": "get /v1/prompts/{prompt_id}/versions",
                    }
                }
            }
        },
    },
    "conversations": {
        "models": {"conversation_object": "Conversation"},
        "methods": {
            "create": {"type": "http", "endpoint": "post /v1/conversations"},
            "retrieve": "get /v1/conversations/{conversation_id}",
            "update": {
                "type": "http",
                "endpoint": "post /v1/conversations/{conversation_id}",
            },
            "delete": {
                "type": "http",
                "endpoint": "delete /v1/conversations/{conversation_id}",
            },
        },
        "subresources": {
            "items": {
                "methods": {
                    "get": {
                        "type": "http",
                        "endpoint": "get /v1/conversations/{conversation_id}/items/{item_id}",
                    },
                    "list": {
                        "type": "http",
                        "endpoint": "get /v1/conversations/{conversation_id}/items",
                    },
                    "create": {
                        "type": "http",
                        "endpoint": "post /v1/conversations/{conversation_id}/items",
                    },
                    "delete": {
                        "type": "http",
                        "endpoint": "delete /v1/conversations/{conversation_id}/items/{item_id}",
                    },
                }
            }
        },
    },
    "inspect": {
        "models": {
            "healthInfo": "HealthInfo",
            "providerInfo": "ProviderInfo",
            "routeInfo": "RouteInfo",
            "versionInfo": "VersionInfo",
        },
        "methods": {"health": "get /v1/health", "version": "get /v1/version"},
    },
    "embeddings": {
        "models": {"create_embeddings_response": "OpenAIEmbeddingsResponse"},
        "methods": {"create": "post /v1/embeddings"},
    },
    "chat": {
        "models": {"chat_completion_chunk": "OpenAIChatCompletionChunk"},
        "subresources": {
            "completions": {
                "methods": {
                    "create": {
                        "type": "http",
                        "streaming": {
                            "stream_event_model": "chat.chat_completion_chunk",
                            "param_discriminator": "stream",
                        },
                        "endpoint": "post /v1/chat/completions",
                    },
                    "list": {
                        "type": "http",
                        "paginated": False,
                        "endpoint": "get /v1/chat/completions",
                    },
                    "retrieve": {
                        "type": "http",
                        "endpoint": "get /v1/chat/completions/{completion_id}",
                    },
                }
            }
        },
    },
    "completions": {
        "methods": {
            "create": {
                "type": "http",
                "streaming": {"param_discriminator": "stream"},
                "endpoint": "post /v1/completions",
            }
        }
    },
    "vector_io": {
        "models": {"queryChunksResponse": "QueryChunksResponse"},
        "methods": {
            "insert": "post /v1/vector-io/insert",
            "query": "post /v1/vector-io/query",
        },
    },
    "vector_stores": {
        "models": {
            "vector_store": "VectorStoreObject",
            "list_vector_stores_response": "VectorStoreListResponse",
            "vector_store_delete_response": "VectorStoreDeleteResponse",
            "vector_store_search_response": "VectorStoreSearchResponsePage",
        },
        "methods": {
            "create": "post /v1/vector_stores",
            "list": "get /v1/vector_stores",
            "retrieve": "get /v1/vector_stores/{vector_store_id}",
            "update": "post /v1/vector_stores/{vector_store_id}",
            "delete": "delete /v1/vector_stores/{vector_store_id}",
            "search": "post /v1/vector_stores/{vector_store_id}/search",
        },
        "subresources": {
            "files": {
                "models": {"vector_store_file": "VectorStoreFileObject"},
                "methods": {
                    "list": "get /v1/vector_stores/{vector_store_id}/files",
                    "retrieve": "get /v1/vector_stores/{vector_store_id}/files/{file_id}",
                    "update": "post /v1/vector_stores/{vector_store_id}/files/{file_id}",
                    "delete": "delete /v1/vector_stores/{vector_store_id}/files/{file_id}",
                    "create": "post /v1/vector_stores/{vector_store_id}/files",
                    "content": "get /v1/vector_stores/{vector_store_id}/files/{file_id}/content",
                },
            },
            "file_batches": {
                "models": {
                    "vector_store_file_batches": "VectorStoreFileBatchObject",
                    "list_vector_store_files_in_batch_response": "VectorStoreFilesListInBatchResponse",
                },
                "methods": {
                    "create": "post /v1/vector_stores/{vector_store_id}/file_batches",
                    "retrieve": "get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}",
                    "list_files": "get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files",
                    "cancel": "post /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel",
                },
            },
        },
    },
    "models": {
        "models": {
            "model": "OpenAIModel",
            "list_models_response": "OpenAIListModelsResponse",
        },
        "methods": {
            "list": {"paginated": False, "endpoint": "get /v1/models"},
            "retrieve": "get /v1/models/{model_id}",
            "register": "post /v1/models",
            "unregister": "delete /v1/models/{model_id}",
        },
        "subresources": {"openai": {"methods": {"list": {"paginated": False, "endpoint": "get /v1/models"}}}},
    },
    "providers": {
        "models": {"list_providers_response": "ListProvidersResponse"},
        "methods": {
            "list": {"paginated": False, "endpoint": "get /v1/providers"},
            "retrieve": "get /v1/providers/{provider_id}",
        },
    },
    "routes": {
        "models": {"list_routes_response": "ListRoutesResponse"},
        "methods": {"list": {"paginated": False, "endpoint": "get /v1/inspect/routes"}},
    },
    "moderations": {
        "models": {"create_response": "ModerationObject"},
        "methods": {"create": "post /v1/moderations"},
    },
    "safety": {
        "models": {"run_shield_response": "RunShieldResponse"},
        "methods": {"run_shield": "post /v1/safety/run-shield"},
    },
    "shields": {
        "models": {"shield": "Shield", "list_shields_response": "ListShieldsResponse"},
        "methods": {
            "retrieve": "get /v1/shields/{identifier}",
            "list": {"paginated": False, "endpoint": "get /v1/shields"},
            "register": "post /v1/shields",
            "delete": "delete /v1/shields/{identifier}",
        },
    },
    "scoring": {
        "methods": {
            "score": "post /v1/scoring/score",
            "score_batch": "post /v1/scoring/score-batch",
        }
    },
    "scoring_functions": {
        "models": {
            "scoring_fn": "ScoringFn",
            "scoring_fn_params": "ScoringFnParams",
            "list_scoring_functions_response": "ListScoringFunctionsResponse",
        },
        "methods": {
            "retrieve": "get /v1/scoring-functions/{scoring_fn_id}",
            "list": {"paginated": False, "endpoint": "get /v1/scoring-functions"},
            "register": "post /v1/scoring-functions",
            "unregister": "delete /v1/scoring-functions/{scoring_fn_id}",
        },
    },
    "files": {
        "models": {
            "file": "OpenAIFileObject",
            "list_files_response": "ListOpenAIFileResponse",
            "delete_file_response": "OpenAIFileDeleteResponse",
        },
        "methods": {
            "create": "post /v1/files",
            "list": "get /v1/files",
            "retrieve": "get /v1/files/{file_id}",
            "delete": "delete /v1/files/{file_id}",
            "content": "get /v1/files/{file_id}/content",
        },
    },
    "batches": {
        "methods": {
            "create": "post /v1/batches",
            "list": "get /v1/batches",
            "retrieve": "get /v1/batches/{batch_id}",
            "cancel": "post /v1/batches/{batch_id}/cancel",
        }
    },
    "alpha": {
        "subresources": {
            "inference": {"methods": {"rerank": "post /v1alpha/inference/rerank"}},
            "post_training": {
                "models": {
                    "algorithm_config": "AlgorithmConfig",
                    "post_training_job": "PostTrainingJob",
                    "list_post_training_jobs_response": "ListPostTrainingJobsResponse",
                },
                "methods": {
                    "preference_optimize": "post /v1alpha/post-training/preference-optimize",
                    "supervised_fine_tune": "post /v1alpha/post-training/supervised-fine-tune",
                },
                "subresources": {
                    "job": {
                        "methods": {
                            "artifacts": "get /v1alpha/post-training/job/artifacts",
                            "cancel": "post /v1alpha/post-training/job/cancel",
                            "status": "get /v1alpha/post-training/job/status",
                            "list": {
                                "paginated": False,
                                "endpoint": "get /v1alpha/post-training/jobs",
                            },
                        }
                    }
                },
            },
            "benchmarks": {
                "models": {
                    "benchmark": "Benchmark",
                    "list_benchmarks_response": "ListBenchmarksResponse",
                },
                "methods": {
                    "retrieve": "get /v1alpha/eval/benchmarks/{benchmark_id}",
                    "list": {
                        "paginated": False,
                        "endpoint": "get /v1alpha/eval/benchmarks",
                    },
                    "register": "post /v1alpha/eval/benchmarks",
                    "unregister": "delete /v1alpha/eval/benchmarks/{benchmark_id}",
                },
            },
            "eval": {
                "models": {
                    "evaluate_response": "EvaluateResponse",
                    "benchmark_config": "BenchmarkConfig",
                    "job": "Job",
                },
                "methods": {
                    "evaluate_rows": "post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations",
                    "run_eval": "post /v1alpha/eval/benchmarks/{benchmark_id}/jobs",
                    "evaluate_rows_alpha": "post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations",
                    "run_eval_alpha": "post /v1alpha/eval/benchmarks/{benchmark_id}/jobs",
                },
                "subresources": {
                    "jobs": {
                        "methods": {
                            "cancel": "delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}",
                            "status": "get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}",
                            "retrieve": "get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result",
                        }
                    }
                },
            },
        }
    },
    "beta": {
        "subresources": {
            "datasets": {
                "models": {"list_datasets_response": "ListDatasetsResponse"},
                "methods": {
                    "register": "post /v1beta/datasets",
                    "retrieve": "get /v1beta/datasets/{dataset_id}",
                    "list": {"paginated": False, "endpoint": "get /v1beta/datasets"},
                    "unregister": "delete /v1beta/datasets/{dataset_id}",
                    "iterrows": "get /v1beta/datasetio/iterrows/{dataset_id}",
                    "appendrows": "post /v1beta/datasetio/append-rows/{dataset_id}",
                },
            }
        }
    },
}


HTTP_METHODS = {"get", "post", "put", "patch", "delete", "options", "head"}
@dataclass
class Endpoint:
    method: str
    path: str
    extra: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_config(cls, value: Any) -> Endpoint:
        if isinstance(value, str):
            method, _, path = value.partition(" ")
            return cls._from_parts(method, path)
        if isinstance(value, dict) and "endpoint" in value:
            method, _, path = value["endpoint"].partition(" ")
            extra = {k: v for k, v in value.items() if k != "endpoint"}
            endpoint = cls._from_parts(method, path)
            endpoint.extra.update(extra)
            return endpoint
        raise ValueError(f"Unsupported endpoint value: {value!r}")

    @classmethod
    def _from_parts(cls, method: str, path: str) -> Endpoint:
        method = method.strip().lower()
        path = path.strip()
        if method not in HTTP_METHODS:
            raise ValueError(f"Unsupported HTTP method for Stainless config: {method!r}")
        if not path.startswith("/"):
            raise ValueError(f"Endpoint path must start with '/': {path!r}")
        return cls(method=method, path=path)

    def to_config(self) -> Any:
        if not self.extra:
            return f"{self.method} {self.path}"
        data = dict(self.extra)
        data["endpoint"] = f"{self.method} {self.path}"
        return data

    def route_key(self) -> str:
        return f"{self.method} {self.path}"
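Endpoint.from_config accepts either the shorthand string form or the dict form with an "endpoint" key plus extra options; both normalize to the same route key. A quick usage sketch against the class as defined above:

    plain = Endpoint.from_config("get /v1/models")
    extended = Endpoint.from_config({"endpoint": "get /v1/models", "paginated": False})

    assert plain.route_key() == extended.route_key() == "get /v1/models"
    assert plain.to_config() == "get /v1/models"
    assert extended.to_config() == {"paginated": False, "endpoint": "get /v1/models"}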
@dataclass
class Resource:
    models: dict[str, str] | None = None
    methods: dict[str, Endpoint] = field(default_factory=dict)
    subresources: dict[str, Resource] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Resource:
        models = data.get("models")
        methods = {name: Endpoint.from_config(value) for name, value in data.get("methods", {}).items()}
        subresources = {name: cls.from_dict(value) for name, value in data.get("subresources", {}).items()}
        return cls(models=models, methods=methods, subresources=subresources)

    def to_config(self) -> dict[str, Any]:
        result: dict[str, Any] = {}
        if self.models:
            result["models"] = self.models
        if self.methods:
            result["methods"] = {name: endpoint.to_config() for name, endpoint in self.methods.items()}
        if self.subresources:
            result["subresources"] = {name: resource.to_config() for name, resource in self.subresources.items()}
        return result

    def collect_endpoint_paths(self) -> set[str]:
        paths = {endpoint.route_key() for endpoint in self.methods.values()}
        for subresource in self.subresources.values():
            paths.update(subresource.collect_endpoint_paths())
        return paths

    def iter_endpoints(self, prefix: str) -> Iterator[tuple[str, str]]:
        for method_name, endpoint in self.methods.items():
            label = f"{prefix}.{method_name}" if prefix else method_name
            yield endpoint.route_key(), label
        for sub_name, subresource in self.subresources.items():
            sub_prefix = f"{prefix}.{sub_name}" if prefix else sub_name
            yield from subresource.iter_endpoints(sub_prefix)


_RESOURCES = {name: Resource.from_dict(data) for name, data in ALL_RESOURCES.items()}
def _load_openapi_paths(openapi_path: Path) -> set[str]:
|
||||
spec = yaml.safe_load(openapi_path.read_text()) or {}
|
||||
paths: set[str] = set()
|
||||
for path, path_item in (spec.get("paths") or {}).items():
|
||||
if not isinstance(path_item, dict):
|
||||
continue
|
||||
for method, operation in path_item.items():
|
||||
if not isinstance(operation, dict):
|
||||
continue
|
||||
paths.add(f"{str(method).lower()} {path}")
|
||||
return paths
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class StainlessConfig:
|
||||
    organization: dict[str, Any]
    security: list[Any]
    security_schemes: dict[str, Any]
    targets: dict[str, Any]
    client_settings: dict[str, Any]
    environments: dict[str, Any]
    pagination: list[dict[str, Any]]
    settings: dict[str, Any]
    openapi: dict[str, Any]
    readme: dict[str, Any]
    resources: dict[str, Resource]

    @classmethod
    def make(cls) -> StainlessConfig:
        return cls(
            organization=ORGANIZATION,
            security=SECURITY,
            security_schemes=SECURITY_SCHEMES,
            targets=TARGETS,
            client_settings=CLIENT_SETTINGS,
            environments=ENVIRONMENTS,
            pagination=PAGINATION,
            settings=SETTINGS,
            openapi=OPENAPI,
            readme=README,
            resources=dict(_RESOURCES),
        )

    def referenced_paths(self) -> set[str]:
        paths: set[str] = set()
        for resource in self.resources.values():
            paths.update(resource.collect_endpoint_paths())
        paths.update(self.readme_endpoint_paths())
        return paths

    def readme_endpoint_paths(self) -> set[str]:
        example_requests = self.readme.get("example_requests", {}) if self.readme else {}
        paths: set[str] = set()
        for entry in example_requests.values():
            endpoint = entry.get("endpoint") if isinstance(entry, dict) else None
            if isinstance(endpoint, str):
                method, _, route = endpoint.partition(" ")
                method = method.strip().lower()
                route = route.strip()
                if method and route:
                    paths.add(f"{method} {route}")
        return paths

    def endpoint_map(self) -> dict[str, list[str]]:
        mapping: dict[str, list[str]] = {}
        for resource_name, resource in self.resources.items():
            for route, label in resource.iter_endpoints(resource_name):
                mapping.setdefault(route, []).append(label)
        return mapping

    def validate_unique_endpoints(self) -> None:
        duplicates: dict[str, list[str]] = {}
        for route, labels in self.endpoint_map().items():
            top_levels = {label.split(".", 1)[0] for label in labels}
            if len(top_levels) > 1:
                duplicates[route] = labels
        if duplicates:
            formatted = "\n".join(
                f"  - {route} defined in: {', '.join(sorted(labels))}" for route, labels in sorted(duplicates.items())
            )
            raise ValueError("Duplicate endpoints found across resources:\n" + formatted)

    def validate_readme_endpoints(self) -> None:
        resource_paths: set[str] = set()
        for resource in self.resources.values():
            resource_paths.update(resource.collect_endpoint_paths())
        missing = sorted(path for path in self.readme_endpoint_paths() if path not in resource_paths)
        if missing:
            formatted = "\n".join(f"  - {path}" for path in missing)
            raise ValueError("README example endpoints are not present in Stainless resources:\n" + formatted)

    def to_dict(self) -> dict[str, Any]:
        cfg: dict[str, Any] = {}
        for section in SECTION_ORDER:
            if section == "resources":
                cfg[section] = {name: resource.to_config() for name, resource in self.resources.items()}
                continue
            cfg[section] = getattr(self, section)
        return cfg

    def validate_against_openapi(self, openapi_path: Path) -> None:
        if not openapi_path.exists():
            raise FileNotFoundError(f"OpenAPI spec not found at {openapi_path}")
        spec_paths = _load_openapi_paths(openapi_path)
        config_paths = self.referenced_paths()
        missing = sorted(path for path in config_paths if path not in spec_paths)
        if missing:
            formatted = "\n".join(f"  - {path}" for path in missing)
            raise ValueError("Stainless config references missing endpoints:\n" + formatted)

    def validate(self, openapi_path: Path | None = None) -> None:
        self.validate_unique_endpoints()
        self.validate_readme_endpoints()
        if openapi_path is not None:
            self.validate_against_openapi(openapi_path)


def build_config() -> dict[str, Any]:
    return StainlessConfig.make().to_dict()


def write_config(repo_root: Path, openapi_path: Path | None = None) -> Path:
    stainless_config = StainlessConfig.make()
    spec_path = (openapi_path or (repo_root / "client-sdks" / "stainless" / "openapi.yml")).resolve()
    stainless_config.validate(spec_path)
    yaml_text = yaml.safe_dump(stainless_config.to_dict(), sort_keys=False)
    output = repo_root / "client-sdks" / "stainless" / "config.yml"
    output.write_text(HEADER + yaml_text)
    return output


def main() -> None:
    repo_root = Path(__file__).resolve().parents[3]
    output = write_config(repo_root)
    print(f"Wrote Stainless config: {output}")


if __name__ == "__main__":
    main()
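For reference, a minimal sketch (not part of the diff) of driving the generator above from Python rather than via `python3 -m ...`; the module path is taken from the regeneration script further down, and the error handling here is illustrative:

# Hypothetical usage of write_config(); assumes the repo layout referenced above.
from pathlib import Path

from scripts.openapi_generator.stainless_config.generate_config import write_config

try:
    output = write_config(Path.cwd())
    print(f"config written to {output}")
except (FileNotFoundError, ValueError) as err:
    # validate() raises ValueError listing duplicate or missing endpoints;
    # validate_against_openapi() raises FileNotFoundError if the spec is absent.
    print(f"config generation failed:\n{err}")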
@@ -8,7 +8,8 @@
import subprocess
import sys
from pathlib import Path
from typing import Any
from types import UnionType
from typing import Annotated, Any, Union, get_args, get_origin

from pydantic_core import PydanticUndefined
from rich.progress import Progress, SpinnerColumn, TextColumn
@@ -51,6 +52,41 @@ class ChangedPathTracker:
        return self._changed_paths


def extract_type_annotation(annotation: Any) -> str:
    """extract a type annotation into a clean string representation."""
    if annotation is None:
        return "Any"

    if annotation is type(None):
        return "None"

    origin = get_origin(annotation)
    args = get_args(annotation)

    # recursive workaround for Annotated types to ignore FieldInfo part
    if origin is Annotated and args:
        return extract_type_annotation(args[0])

    if origin in [Union, UnionType]:
        non_none_args = [arg for arg in args if arg is not type(None)]
        has_none = len(non_none_args) < len(args)

        if len(non_none_args) == 1:
            formatted = extract_type_annotation(non_none_args[0])
            return f"{formatted} | None" if has_none else formatted
        else:
            formatted_args = [extract_type_annotation(arg) for arg in non_none_args]
            result = " | ".join(formatted_args)
            return f"{result} | None" if has_none else result

    if origin is not None and args:
        origin_name = getattr(origin, "__name__", str(origin))
        formatted_args = [extract_type_annotation(arg) for arg in args]
        return f"{origin_name}[{', '.join(formatted_args)}]"

    return annotation.__name__ if hasattr(annotation, "__name__") else str(annotation)


def get_config_class_info(config_class_path: str) -> dict[str, Any]:
    """Extract configuration information from a config class."""
    try:
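A quick illustration (not from the diff) of what extract_type_annotation returns for a few common annotations, traced against the implementation above; `Field` here is pydantic's:

# Hypothetical spot-checks for extract_type_annotation().
from typing import Annotated, Optional
from pydantic import Field

print(extract_type_annotation(int | None))               # int | None
print(extract_type_annotation(Optional[str]))            # str | None
print(extract_type_annotation(dict[str, list[int]]))     # dict[str, list[int]]
print(extract_type_annotation(Annotated[str, Field()]))  # str  (FieldInfo metadata dropped)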
@@ -78,14 +114,8 @@ def get_config_class_info(config_class_path: str) -> dict[str, Any]:
        for field_name, field in config_class.model_fields.items():
            if getattr(field, "exclude", False):
                continue
            field_type = str(field.annotation) if field.annotation else "Any"

            # this string replace is ridiculous
            field_type = field_type.replace("typing.", "").replace("Optional[", "").replace("]", "")
            field_type = field_type.replace("Annotated[", "").replace("FieldInfo(", "").replace(")", "")
            field_type = field_type.replace("llama_stack_api.inference.", "")
            field_type = field_type.replace("llama_stack.providers.", "")
            field_type = field_type.replace("llama_stack_api.datatypes.", "")
            field_type = extract_type_annotation(field.annotation)

            default_value = field.default
            if field.default_factory is not None:
@@ -345,6 +375,14 @@ def generate_index_docs(api_name: str, api_docstring: str | None, provider_entri
    # Add YAML frontmatter for index
    md_lines.append("---")
    if api_docstring:
        # Handle multi-line descriptions in YAML
        if "\n" in api_docstring.strip():
            md_lines.append("description: |")
            for line in api_docstring.strip().split("\n"):
                # Avoid trailing whitespace by only adding spaces to non-empty lines
                md_lines.append(f"  {line}" if line.strip() else "")
        else:
            # For single line descriptions, format properly for YAML
            clean_desc = api_docstring.strip().replace('"', '\\"')
            md_lines.append(f'description: "{clean_desc}"')
    md_lines.append(f"sidebar_label: {sidebar_label}")
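To make the frontmatter behavior concrete, here is a small standalone trace of the multi-line branch above; the docstring value is hypothetical, and the variable names mirror the diff:

# Standalone trace of the multi-line description branch.
md_lines: list[str] = ["---"]
api_docstring = "Agents API.\n\nOrchestrate multi-step workflows."
if "\n" in api_docstring.strip():
    md_lines.append("description: |")
    for line in api_docstring.strip().split("\n"):
        md_lines.append(f"  {line}" if line.strip() else "")
print("\n".join(md_lines))
# ---
# description: |
#   Agents API.
#
#   Orchestrate multi-step workflows.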
@@ -17,3 +17,5 @@ PYTHONPATH=$PYTHONPATH:$stack_dir \
    python3 -m scripts.openapi_generator "$stack_dir"/docs/static

cp "$stack_dir"/docs/static/stainless-llama-stack-spec.yaml "$stack_dir"/client-sdks/stainless/openapi.yml
PYTHONPATH=$PYTHONPATH:$stack_dir \
    python3 -m scripts.openapi_generator.stainless_config.generate_config
@@ -11,10 +11,9 @@ from typing import Any, Literal
from pydantic import BaseModel, TypeAdapter

from llama_stack.core.datatypes import AccessRule, StackRunConfig
from llama_stack.core.storage.sqlstore.authorized_sqlstore import AuthorizedSqlStore
from llama_stack.core.storage.sqlstore.sqlstore import sqlstore_impl
from llama_stack.log import get_logger
from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType
from llama_stack.providers.utils.sqlstore.authorized_sqlstore import AuthorizedSqlStore
from llama_stack.providers.utils.sqlstore.sqlstore import sqlstore_impl
from llama_stack_api import (
    Conversation,
    ConversationDeletedResource,

@@ -25,6 +24,7 @@ from llama_stack_api import (
    Conversations,
    Metadata,
)
from llama_stack_api.internal.sqlstore import ColumnDefinition, ColumnType

logger = get_logger(name=__name__, category="openai_conversations")
@@ -10,7 +10,7 @@ from typing import Any
from pydantic import BaseModel

from llama_stack.core.datatypes import StackRunConfig
from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl
from llama_stack.core.storage.kvstore import KVStore, kvstore_impl
from llama_stack_api import ListPromptsResponse, Prompt, Prompts
@@ -11,9 +11,9 @@ from datetime import UTC, datetime, timedelta
from starlette.types import ASGIApp, Receive, Scope, Send

from llama_stack.core.storage.datatypes import KVStoreReference, StorageBackendType
from llama_stack.core.storage.kvstore.kvstore import _KVSTORE_BACKENDS, kvstore_impl
from llama_stack.log import get_logger
from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.kvstore.kvstore import _KVSTORE_BACKENDS, kvstore_impl
from llama_stack_api.internal.kvstore import KVStore

logger = get_logger(name=__name__, category="core::server")
@@ -385,8 +385,8 @@ def _initialize_storage(run_config: StackRunConfig):
    else:
        raise ValueError(f"Unknown storage backend type: {type}")

    from llama_stack.providers.utils.kvstore.kvstore import register_kvstore_backends
    from llama_stack.providers.utils.sqlstore.sqlstore import register_sqlstore_backends
    from llama_stack.core.storage.kvstore.kvstore import register_kvstore_backends
    from llama_stack.core.storage.sqlstore.sqlstore import register_sqlstore_backends

    register_kvstore_backends(kv_backends)
    register_sqlstore_backends(sql_backends)
@@ -12,6 +12,8 @@ from typing import Annotated, Literal

from pydantic import BaseModel, Field, field_validator

from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR


class StorageBackendType(StrEnum):
    KV_REDIS = "kv_redis"
@@ -256,15 +258,24 @@ class ResponsesStoreReference(InferenceStoreReference):

class ServerStoresConfig(BaseModel):
    metadata: KVStoreReference | None = Field(
        default=None,
        default=KVStoreReference(
            backend="kv_default",
            namespace="registry",
        ),
        description="Metadata store configuration (uses KV backend)",
    )
    inference: InferenceStoreReference | None = Field(
        default=None,
        default=InferenceStoreReference(
            backend="sql_default",
            table_name="inference_store",
        ),
        description="Inference store configuration (uses SQL backend)",
    )
    conversations: SqlStoreReference | None = Field(
        default=None,
        default=SqlStoreReference(
            backend="sql_default",
            table_name="openai_conversations",
        ),
        description="Conversations store configuration (uses SQL backend)",
    )
    responses: ResponsesStoreReference | None = Field(
@@ -272,13 +283,21 @@ class ServerStoresConfig(BaseModel):
        description="Responses store configuration (uses SQL backend)",
    )
    prompts: KVStoreReference | None = Field(
        default=None,
        default=KVStoreReference(backend="kv_default", namespace="prompts"),
        description="Prompts store configuration (uses KV backend)",
    )


class StorageConfig(BaseModel):
    backends: dict[str, StorageBackendConfig] = Field(
        default={
            "kv_default": SqliteKVStoreConfig(
                db_path=f"${{env.SQLITE_STORE_DIR:={DISTRIBS_BASE_DIR}}}/kvstore.db",
            ),
            "sql_default": SqliteSqlStoreConfig(
                db_path=f"${{env.SQLITE_STORE_DIR:={DISTRIBS_BASE_DIR}}}/sql_store.db",
            ),
        },
        description="Named backend configurations (e.g., 'default', 'cache')",
    )
    stores: ServerStoresConfig = Field(
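The net effect of the new defaults, sketched below under the assumption that `ServerStoresConfig` is importable from `llama_stack.core.storage.datatypes` (as the imports elsewhere in this diff suggest): a bare config now points at the shared `kv_default`/`sql_default` backends instead of `None`.

# Sketch: the new field defaults resolve without explicit configuration.
from llama_stack.core.storage.datatypes import ServerStoresConfig

stores = ServerStoresConfig()
assert stores.metadata is not None and stores.metadata.backend == "kv_default"
assert stores.inference is not None and stores.inference.table_name == "inference_store"
assert stores.prompts is not None and stores.prompts.namespace == "prompts"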
9
src/llama_stack/core/storage/kvstore/__init__.py
Normal file

@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack_api.internal.kvstore import KVStore as KVStore

from .kvstore import *  # noqa: F401, F403
@@ -13,11 +13,19 @@ from __future__ import annotations

import asyncio
from collections import defaultdict
from datetime import datetime
from typing import cast

from llama_stack.core.storage.datatypes import KVStoreReference, StorageBackendConfig, StorageBackendType
from llama_stack.core.storage.datatypes import KVStoreReference, StorageBackendConfig
from llama_stack_api.internal.kvstore import KVStore

from .api import KVStore
from .config import KVStoreConfig
from .config import (
    KVStoreConfig,
    MongoDBKVStoreConfig,
    PostgresKVStoreConfig,
    RedisKVStoreConfig,
    SqliteKVStoreConfig,
)


def kvstore_dependencies():
@@ -33,7 +41,7 @@ def kvstore_dependencies():

class InmemoryKVStoreImpl(KVStore):
    def __init__(self):
        self._store = {}
        self._store: dict[str, str] = {}

    async def initialize(self) -> None:
        pass
@@ -41,7 +49,7 @@ class InmemoryKVStoreImpl(KVStore):
    async def get(self, key: str) -> str | None:
        return self._store.get(key)

    async def set(self, key: str, value: str) -> None:
    async def set(self, key: str, value: str, expiration: datetime | None = None) -> None:
        self._store[key] = value

    async def values_in_range(self, start_key: str, end_key: str) -> list[str]:
@@ -70,7 +78,8 @@ def register_kvstore_backends(backends: dict[str, StorageBackendConfig]) -> None:
    _KVSTORE_INSTANCES.clear()
    _KVSTORE_LOCKS.clear()
    for name, cfg in backends.items():
        _KVSTORE_BACKENDS[name] = cfg
        typed_cfg = cast(KVStoreConfig, cfg)
        _KVSTORE_BACKENDS[name] = typed_cfg


async def kvstore_impl(reference: KVStoreReference) -> KVStore:
@@ -94,19 +103,20 @@ async def kvstore_impl(reference: KVStoreReference) -> KVStore:
    config = backend_config.model_copy()
    config.namespace = reference.namespace

    if config.type == StorageBackendType.KV_REDIS.value:
    impl: KVStore
    if isinstance(config, RedisKVStoreConfig):
        from .redis import RedisKVStoreImpl

        impl = RedisKVStoreImpl(config)
    elif config.type == StorageBackendType.KV_SQLITE.value:
    elif isinstance(config, SqliteKVStoreConfig):
        from .sqlite import SqliteKVStoreImpl

        impl = SqliteKVStoreImpl(config)
    elif config.type == StorageBackendType.KV_POSTGRES.value:
    elif isinstance(config, PostgresKVStoreConfig):
        from .postgres import PostgresKVStoreImpl

        impl = PostgresKVStoreImpl(config)
    elif config.type == StorageBackendType.KV_MONGODB.value:
    elif isinstance(config, MongoDBKVStoreConfig):
        from .mongodb import MongoDBKVStoreImpl

        impl = MongoDBKVStoreImpl(config)
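Putting the registration and dispatch pieces together, a minimal end-to-end sketch; it assumes the import paths introduced in this diff, and the SQLite db path is illustrative:

# Sketch: register a named backend, then resolve a namespaced KVStore from it.
import asyncio

from llama_stack.core.storage.datatypes import KVStoreReference
from llama_stack.core.storage.kvstore.kvstore import (
    SqliteKVStoreConfig,
    kvstore_impl,
    register_kvstore_backends,
)

register_kvstore_backends({"kv_default": SqliteKVStoreConfig(db_path="/tmp/demo_kvstore.db")})


async def demo() -> None:
    store = await kvstore_impl(KVStoreReference(backend="kv_default", namespace="demo"))
    await store.set("greeting", "hello")  # set() now also accepts an optional expiration
    print(await store.get("greeting"))


asyncio.run(demo())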
Some files were not shown because too many files have changed in this diff.