Merge branch 'main' into custom-collection-name-vectordb

Roy Belio 2025-11-20 19:37:06 +02:00 committed by GitHub
commit 3f5576b7d6
117 changed files with 16294 additions and 769 deletions

.github/CODEOWNERS

@@ -2,4 +2,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
-* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @leseb @bbrowning @reluctantfuturist @mattf @slekkala1 @franciscojavierarceo
+* @ashwinb @raghotham @ehhuang @leseb @bbrowning @mattf @franciscojavierarceo


@@ -0,0 +1,35 @@
name: Setup TypeScript client
description: Conditionally checkout and link llama-stack-client-typescript based on client-version
inputs:
client-version:
description: 'Client version (latest or published)'
required: true
outputs:
ts-client-path:
description: 'Path or version to use for TypeScript client'
value: ${{ steps.set-path.outputs.ts-client-path }}
runs:
using: "composite"
steps:
- name: Checkout TypeScript client (latest)
if: ${{ inputs.client-version == 'latest' }}
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: llamastack/llama-stack-client-typescript
ref: main
path: .ts-client-checkout
- name: Set TS_CLIENT_PATH
id: set-path
shell: bash
run: |
if [ "${{ inputs.client-version }}" = "latest" ]; then
echo "ts-client-path=${{ github.workspace }}/.ts-client-checkout" >> $GITHUB_OUTPUT
elif [ "${{ inputs.client-version }}" = "published" ]; then
echo "ts-client-path=^0.3.2" >> $GITHUB_OUTPUT
else
echo "::error::Invalid client-version: ${{ inputs.client-version }}"
exit 1
fi
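The action's final step maps the `client-version` input to an output: `latest` resolves to the local checkout path, `published` resolves to the pinned npm range, and anything else fails the job. A rough Python rendering of that resolution logic, for illustration only (the function name is made up; the checkout path and the `^0.3.2` pin come from the step above):

```python
import os


def resolve_ts_client_path(client_version: str, workspace: str) -> str:
    """Mirror the composite action's ts-client-path output."""
    if client_version == "latest":
        # the checkout step above places the client repo under .ts-client-checkout
        return os.path.join(workspace, ".ts-client-checkout")
    if client_version == "published":
        # pinned npm release used when testing against the published package
        return "^0.3.2"
    raise ValueError(f"Invalid client-version: {client_version}")
```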


@@ -93,11 +93,27 @@ jobs:
suite: ${{ matrix.config.suite }}
inference-mode: 'replay'
- name: Setup Node.js for TypeScript client tests
if: ${{ matrix.client == 'server' }}
uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
with:
node-version: '20'
cache: 'npm'
cache-dependency-path: tests/integration/client-typescript/package-lock.json
- name: Setup TypeScript client
if: ${{ matrix.client == 'server' }}
id: setup-ts-client
uses: ./.github/actions/setup-typescript-client
with:
client-version: ${{ matrix.client-version }}
- name: Run tests
if: ${{ matrix.config.allowed_clients == null || contains(matrix.config.allowed_clients, matrix.client) }}
uses: ./.github/actions/run-and-record-tests
env:
OPENAI_API_KEY: dummy
TS_CLIENT_PATH: ${{ steps.setup-ts-client.outputs.ts-client-path || '' }}
with:
stack-config: >-
${{ matrix.config.stack_config


@@ -43,7 +43,41 @@ env:
# Stainless organization dashboard
jobs:
compute-branch:
runs-on: ubuntu-latest
outputs:
preview_branch: ${{ steps.compute.outputs.preview_branch }}
base_branch: ${{ steps.compute.outputs.base_branch }}
merge_branch: ${{ steps.compute.outputs.merge_branch }}
steps:
- name: Compute branch names
id: compute
run: |
HEAD_REPO="${{ github.event.pull_request.head.repo.full_name }}"
BASE_REPO="${{ github.repository }}"
BRANCH_NAME="${{ github.event.pull_request.head.ref }}"
FORK_OWNER="${{ github.event.pull_request.head.repo.owner.login }}"
if [ "$HEAD_REPO" != "$BASE_REPO" ]; then
# Fork PR: prefix with fork owner for isolation
if [ -z "$FORK_OWNER" ]; then
echo "Error: Fork PR detected but fork owner is empty" >&2
exit 1
fi
PREVIEW_BRANCH="preview/${FORK_OWNER}/${BRANCH_NAME}"
BASE_BRANCH="preview/base/${FORK_OWNER}/${BRANCH_NAME}"
else
# Same-repo PR
PREVIEW_BRANCH="preview/${BRANCH_NAME}"
BASE_BRANCH="preview/base/${BRANCH_NAME}"
fi
echo "preview_branch=${PREVIEW_BRANCH}" >> $GITHUB_OUTPUT
echo "base_branch=${BASE_BRANCH}" >> $GITHUB_OUTPUT
echo "merge_branch=${PREVIEW_BRANCH}" >> $GITHUB_OUTPUT
preview:
needs: compute-branch
if: github.event.action != 'closed'
runs-on: ubuntu-latest
permissions:
@@ -59,8 +93,6 @@ jobs:
ref: ${{ github.event.pull_request.head.sha }}
fetch-depth: 2
# This action builds preview SDKs from the OpenAPI spec changes and
# posts/updates a comment on the PR with build results and links to the preview.
- name: Run preview builds
uses: stainless-api/upload-openapi-spec-action/preview@32823b096b4319c53ee948d702d9052873af485f # 1.6.0
with:
@@ -73,8 +105,11 @@
base_sha: ${{ github.event.pull_request.base.sha }}
base_ref: ${{ github.event.pull_request.base.ref }}
head_sha: ${{ github.event.pull_request.head.sha }}
branch: ${{ needs.compute-branch.outputs.preview_branch }}
base_branch: ${{ needs.compute-branch.outputs.base_branch }}
merge:
needs: compute-branch
if: github.event.action == 'closed' && github.event.pull_request.merged == true
runs-on: ubuntu-latest
permissions:
@@ -91,11 +126,11 @@
fetch-depth: 2
# Note that this only merges in changes that happened on the last build on
-# preview/${{ github.head_ref }}. It's possible that there are OAS/config
-# changes that haven't been built, if the preview-sdk job didn't finish
+# the computed preview branch. It's possible that there are OAS/config
+# changes that haven't been built, if the preview job didn't finish
# before this step starts. In theory we want to wait for all builds
-# against preview/${{ github.head_ref }} to complete, but assuming that
-# the preview-sdk job happens before the PR merge, it should be fine.
+# against the preview branch to complete, but assuming that
+# the preview job happens before the PR merge, it should be fine.
- name: Run merge build
uses: stainless-api/upload-openapi-spec-action/merge@32823b096b4319c53ee948d702d9052873af485f # 1.6.0
with:
@@ -108,3 +143,4 @@ jobs:
base_sha: ${{ github.event.pull_request.base.sha }}
base_ref: ${{ github.event.pull_request.base.ref }}
head_sha: ${{ github.event.pull_request.head.sha }}
merge_branch: ${{ needs.compute-branch.outputs.merge_branch }}
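The new compute-branch job above isolates fork PRs by folding the fork owner into the Stainless preview branch names; same-repo PRs keep the plain `preview/<branch>` form, and the merge branch always matches the preview branch. A sketch of the same naming rule in Python (the function name is invented; the branch formats are taken from the job's script):

```python
def compute_preview_branches(head_repo: str, base_repo: str, branch: str, fork_owner: str) -> dict[str, str]:
    """Reproduce the compute-branch job's outputs."""
    if head_repo != base_repo:
        # Fork PR: prefix with fork owner for isolation
        if not fork_owner:
            raise ValueError("Fork PR detected but fork owner is empty")
        preview = f"preview/{fork_owner}/{branch}"
        base = f"preview/base/{fork_owner}/{branch}"
    else:
        # Same-repo PR
        preview = f"preview/{branch}"
        base = f"preview/base/{branch}"
    # merge_branch reuses the preview branch name
    return {"preview_branch": preview, "base_branch": base, "merge_branch": preview}
```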

.gitignore

@@ -35,3 +35,5 @@ docs/static/imported-files/
docs/docs/api-deprecated/
docs/docs/api-experimental/
docs/docs/api/
tests/integration/client-typescript/node_modules/
.ts-client-checkout/


@@ -9862,9 +9862,21 @@ components:
title: Object
default: vector_store.file
attributes:
-additionalProperties: true
+additionalProperties:
anyOf:
- type: string
maxLength: 512
- type: number
- type: boolean
title: string | number | boolean
propertyNames:
type: string
maxLength: 64
type: object
maxProperties: 16
title: Attributes
description: Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard. Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters, booleans, or numbers.
x-oaiTypeLabel: map
chunking_strategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
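The tightened `attributes` schema mirrors the constraint described above: at most 16 key-value pairs, keys are strings of at most 64 characters, and values are strings of at most 512 characters, numbers, or booleans. A minimal validator sketch of those constraints (a hypothetical helper, not part of the spec or the server code):

```python
def validate_attributes(attributes: dict) -> None:
    """Check a vector_store.file attributes map against the schema limits."""
    if len(attributes) > 16:
        raise ValueError("attributes may contain at most 16 key-value pairs")
    for key, value in attributes.items():
        if not isinstance(key, str) or len(key) > 64:
            raise ValueError(f"key {key!r} must be a string of at most 64 characters")
        if isinstance(value, (bool, int, float)):
            continue  # numbers and booleans are allowed as-is
        if isinstance(value, str) and len(value) <= 512:
            continue  # strings are capped at 512 characters
        raise ValueError(f"value for {key!r} must be a string (<=512 chars), number, or boolean")
```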


@@ -24,7 +24,7 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `api_base` | `HttpUrl` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) |
+| `base_url` | `HttpUrl \| None` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com/openai/v1) |
| `api_version` | `str \| None` | No | | Azure API version for Azure (e.g., 2024-12-01-preview) |
| `api_type` | `str \| None` | No | azure | Azure API type for Azure (e.g., azure) |
@@ -32,7 +32,7 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
```yaml
api_key: ${env.AZURE_API_KEY:=}
-api_base: ${env.AZURE_API_BASE:=}
+base_url: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
```


@@ -17,11 +17,11 @@ Cerebras inference provider for running models on Cerebras Cloud platform.
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `base_url` | `str` | No | https://api.cerebras.ai | Base URL for the Cerebras API |
+| `base_url` | `HttpUrl \| None` | No | https://api.cerebras.ai/v1 | Base URL for the Cerebras API |
## Sample Configuration
```yaml
-base_url: https://api.cerebras.ai
+base_url: https://api.cerebras.ai/v1
api_key: ${env.CEREBRAS_API_KEY:=}
```


@@ -17,11 +17,11 @@ Databricks inference provider for running models on Databricks' unified analytic
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_token` | `SecretStr \| None` | No | | The Databricks API token |
-| `url` | `str \| None` | No | | The URL for the Databricks model serving endpoint |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the Databricks model serving endpoint (should include /serving-endpoints path) |
## Sample Configuration
```yaml
-url: ${env.DATABRICKS_HOST:=}
+base_url: ${env.DATABRICKS_HOST:=}
api_token: ${env.DATABRICKS_TOKEN:=}
```


@@ -17,11 +17,11 @@ Fireworks AI inference provider for Llama models and other AI models on the Fire
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |
+| `base_url` | `HttpUrl \| None` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |
## Sample Configuration
```yaml
-url: https://api.fireworks.ai/inference/v1
+base_url: https://api.fireworks.ai/inference/v1
api_key: ${env.FIREWORKS_API_KEY:=}
```


@@ -17,11 +17,11 @@ Groq inference provider for ultra-fast inference using Groq's LPU technology.
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://api.groq.com | The URL for the Groq AI server |
+| `base_url` | `HttpUrl \| None` | No | https://api.groq.com/openai/v1 | The URL for the Groq AI server |
## Sample Configuration
```yaml
-url: https://api.groq.com
+base_url: https://api.groq.com/openai/v1
api_key: ${env.GROQ_API_KEY:=}
```


@@ -17,11 +17,11 @@ Llama OpenAI-compatible provider for using Llama models with OpenAI API format.
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `openai_compat_api_base` | `str` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |
+| `base_url` | `HttpUrl \| None` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |
## Sample Configuration
```yaml
-openai_compat_api_base: https://api.llama.com/compat/v1/
+base_url: https://api.llama.com/compat/v1/
api_key: ${env.LLAMA_API_KEY}
```


@@ -17,15 +17,13 @@ NVIDIA inference provider for accessing NVIDIA NIM models and AI services.
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://integrate.api.nvidia.com | A base url for accessing the NVIDIA NIM |
+| `base_url` | `HttpUrl \| None` | No | https://integrate.api.nvidia.com/v1 | A base url for accessing the NVIDIA NIM |
| `timeout` | `int` | No | 60 | Timeout for the HTTP requests |
| `append_api_version` | `bool` | No | True | When set to false, the API version will not be appended to the base_url. By default, it is true. |
| `rerank_model_to_url` | `dict[str, str]` | No | `{'nv-rerank-qa-mistral-4b:1': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking', 'nvidia/nv-rerankqa-mistral-4b-v3': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/nv-rerankqa-mistral-4b-v3/reranking', 'nvidia/llama-3.2-nv-rerankqa-1b-v2': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking'}` | Mapping of rerank model identifiers to their API endpoints. |
## Sample Configuration
```yaml
-url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
```


@@ -16,10 +16,10 @@ Ollama inference provider for running local models through the Ollama runtime.
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
-| `url` | `str` | No | http://localhost:11434 | |
+| `base_url` | `HttpUrl \| None` | No | http://localhost:11434/v1 | |
## Sample Configuration
```yaml
-url: ${env.OLLAMA_URL:=http://localhost:11434}
+base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
```


@@ -17,7 +17,7 @@ OpenAI inference provider for accessing GPT models and other OpenAI services.
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `base_url` | `str` | No | https://api.openai.com/v1 | Base URL for OpenAI API |
+| `base_url` | `HttpUrl \| None` | No | https://api.openai.com/v1 | Base URL for OpenAI API |
## Sample Configuration


@@ -17,11 +17,11 @@ Passthrough inference provider for connecting to any external inference service
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | | The URL for the passthrough endpoint |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the passthrough endpoint |
## Sample Configuration
```yaml
-url: ${env.PASSTHROUGH_URL}
+base_url: ${env.PASSTHROUGH_URL}
api_key: ${env.PASSTHROUGH_API_KEY}
```


@@ -17,11 +17,11 @@ RunPod inference provider for running models on RunPod's cloud GPU platform.
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_token` | `SecretStr \| None` | No | | The API token |
-| `url` | `str \| None` | No | | The URL for the Runpod model serving endpoint |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the Runpod model serving endpoint |
## Sample Configuration
```yaml
-url: ${env.RUNPOD_URL:=}
+base_url: ${env.RUNPOD_URL:=}
api_token: ${env.RUNPOD_API_TOKEN}
```


@@ -17,11 +17,11 @@ SambaNova inference provider for running models on SambaNova's dataflow architec
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
+| `base_url` | `HttpUrl \| None` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
## Sample Configuration
```yaml
-url: https://api.sambanova.ai/v1
+base_url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=}
```


@@ -16,10 +16,10 @@ Text Generation Inference (TGI) provider for HuggingFace model serving.
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
-| `url` | `str` | No | | The URL for the TGI serving endpoint |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the TGI serving endpoint (should include /v1 path) |
## Sample Configuration
```yaml
-url: ${env.TGI_URL:=}
+base_url: ${env.TGI_URL:=}
```


@@ -17,11 +17,11 @@ Together AI inference provider for open-source models and collaborative AI devel
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://api.together.xyz/v1 | The URL for the Together AI server |
+| `base_url` | `HttpUrl \| None` | No | https://api.together.xyz/v1 | The URL for the Together AI server |
## Sample Configuration
```yaml
-url: https://api.together.xyz/v1
+base_url: https://api.together.xyz/v1
api_key: ${env.TOGETHER_API_KEY:=}
```


@@ -17,14 +17,14 @@ Remote vLLM inference provider for connecting to vLLM servers.
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_token` | `SecretStr \| None` | No | | The API token |
-| `url` | `str \| None` | No | | The URL for the vLLM model serving endpoint |
+| `base_url` | `HttpUrl \| None` | No | | The URL for the vLLM model serving endpoint |
| `max_tokens` | `int` | No | 4096 | Maximum number of tokens to generate. |
| `tls_verify` | `bool \| str` | No | True | Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file. |
## Sample Configuration
```yaml
-url: ${env.VLLM_URL:=}
+base_url: ${env.VLLM_URL:=}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}


@@ -17,14 +17,14 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
-| `url` | `str` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
+| `base_url` | `HttpUrl \| None` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
| `project_id` | `str \| None` | No | | The watsonx.ai project ID |
| `timeout` | `int` | No | 60 | Timeout for the HTTP requests |
## Sample Configuration
```yaml
-url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
+base_url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
api_key: ${env.WATSONX_API_KEY:=}
project_id: ${env.WATSONX_PROJECT_ID:=}
```
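The same rename runs through all of the provider docs above: the legacy `url`, `api_base`, and `openai_compat_api_base` fields are consolidated into a single `base_url` (typed `HttpUrl | None`), and OpenAI-compatible defaults now carry their path suffix (`/v1`, `/openai/v1`, and so on). A hedged sketch of what migrating an old config mapping could look like; the helper name and key list are assumptions, not a shipped utility:

```python
LEGACY_URL_KEYS = ("url", "api_base", "openai_compat_api_base")


def migrate_provider_config(config: dict) -> dict:
    """Rename legacy URL fields to base_url, leaving everything else untouched."""
    migrated = dict(config)
    for key in LEGACY_URL_KEYS:
        if key in migrated and "base_url" not in migrated:
            migrated["base_url"] = migrated.pop(key)
    return migrated


# Example: {"url": "http://localhost:11434"} -> {"base_url": "http://localhost:11434"};
# note the new defaults also append the API path (e.g. /v1) to the URL.
```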

docs/package-lock.json

@@ -10712,12 +10712,6 @@
"integrity": "sha512-QMUezzXWII9EV5aTFXW1UBVUO77wYPpjqIF8/AviUCThNeSYZykpoTixUeaNNBwmCev0AMDWMAni+f8Hxb1IFw==",
"license": "Unlicense"
},
"node_modules/fs.realpath": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz",
"integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==",
"license": "ISC"
},
"node_modules/fsevents": { "node_modules/fsevents": {
"version": "2.3.3", "version": "2.3.3",
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
@@ -10821,21 +10815,20 @@
"license": "ISC"
},
"node_modules/glob": {
-"version": "7.2.3",
-"resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz",
-"integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==",
+"version": "10.5.0",
+"resolved": "https://registry.npmjs.org/glob/-/glob-10.5.0.tgz",
+"integrity": "sha512-DfXN8DfhJ7NH3Oe7cFmu3NCu1wKbkReJ8TorzSAFbSKrlNaQSKfIzqYqVY8zlbs2NLBbWpRiU52GX2PbaBVNkg==",
"deprecated": "Glob versions prior to v9 are no longer supported",
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"fs.realpath": "^1.0.0", "foreground-child": "^3.1.0",
"inflight": "^1.0.4", "jackspeak": "^3.1.2",
"inherits": "2", "minimatch": "^9.0.4",
"minimatch": "^3.1.1", "minipass": "^7.1.2",
"once": "^1.3.0", "package-json-from-dist": "^1.0.0",
"path-is-absolute": "^1.0.0" "path-scurry": "^1.11.1"
}, },
"engines": { "bin": {
"node": "*" "glob": "dist/esm/bin.mjs"
}, },
"funding": { "funding": {
"url": "https://github.com/sponsors/isaacs" "url": "https://github.com/sponsors/isaacs"
@@ -10859,26 +10852,19 @@
"integrity": "sha512-lkX1HJXwyMcprw/5YUZc2s7DrpAiHB21/V+E1rHUrVNokkvB6bqMzT0VfV6/86ZNabt1k14YOIaT7nDvOX3Iiw==",
"license": "BSD-2-Clause"
},
"node_modules/glob/node_modules/brace-expansion": {
"version": "1.1.12",
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz",
"integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==",
"license": "MIT",
"dependencies": {
"balanced-match": "^1.0.0",
"concat-map": "0.0.1"
}
},
"node_modules/glob/node_modules/minimatch": { "node_modules/glob/node_modules/minimatch": {
"version": "3.1.2", "version": "9.0.5",
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz",
"integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==",
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"brace-expansion": "^1.1.7" "brace-expansion": "^2.0.1"
}, },
"engines": { "engines": {
"node": "*" "node": ">=16 || 14 >=14.17"
},
"funding": {
"url": "https://github.com/sponsors/isaacs"
} }
},
"node_modules/global-dirs": {
@@ -11792,17 +11778,6 @@
"node": ">=12"
}
},
"node_modules/inflight": {
"version": "1.0.6",
"resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz",
"integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==",
"deprecated": "This module is not supported, and leaks memory. Do not use it. Check out lru-cache if you want a good and tested way to coalesce async requests by a key value, which is much more comprehensive and powerful.",
"license": "ISC",
"dependencies": {
"once": "^1.3.0",
"wrappy": "1"
}
},
"node_modules/inherits": { "node_modules/inherits": {
"version": "2.0.4", "version": "2.0.4",
"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
@@ -15570,15 +15545,6 @@
"node": ">= 0.8"
}
},
"node_modules/once": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
"integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
"license": "ISC",
"dependencies": {
"wrappy": "1"
}
},
"node_modules/onetime": { "node_modules/onetime": {
"version": "5.1.2", "version": "5.1.2",
"resolved": "https://registry.npmjs.org/onetime/-/onetime-5.1.2.tgz", "resolved": "https://registry.npmjs.org/onetime/-/onetime-5.1.2.tgz",
@@ -15955,15 +15921,6 @@
"node": "^12.20.0 || ^14.13.1 || >=16.0.0"
}
},
"node_modules/path-is-absolute": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz",
"integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/path-is-inside": { "node_modules/path-is-inside": {
"version": "1.0.2", "version": "1.0.2",
"resolved": "https://registry.npmjs.org/path-is-inside/-/path-is-inside-1.0.2.tgz", "resolved": "https://registry.npmjs.org/path-is-inside/-/path-is-inside-1.0.2.tgz",
@@ -20038,41 +19995,6 @@
"node": ">= 6"
}
},
"node_modules/sucrase/node_modules/glob": {
"version": "10.4.5",
"resolved": "https://registry.npmjs.org/glob/-/glob-10.4.5.tgz",
"integrity": "sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==",
"license": "ISC",
"dependencies": {
"foreground-child": "^3.1.0",
"jackspeak": "^3.1.2",
"minimatch": "^9.0.4",
"minipass": "^7.1.2",
"package-json-from-dist": "^1.0.0",
"path-scurry": "^1.11.1"
},
"bin": {
"glob": "dist/esm/bin.mjs"
},
"funding": {
"url": "https://github.com/sponsors/isaacs"
}
},
"node_modules/sucrase/node_modules/minimatch": {
"version": "9.0.5",
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz",
"integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==",
"license": "ISC",
"dependencies": {
"brace-expansion": "^2.0.1"
},
"engines": {
"node": ">=16 || 14 >=14.17"
},
"funding": {
"url": "https://github.com/sponsors/isaacs"
}
},
"node_modules/supports-color": { "node_modules/supports-color": {
"version": "7.2.0", "version": "7.2.0",
"resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
@@ -21620,12 +21542,6 @@
"url": "https://github.com/chalk/strip-ansi?sponsor=1"
}
},
"node_modules/wrappy": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
"license": "ISC"
},
"node_modules/write-file-atomic": { "node_modules/write-file-atomic": {
"version": "3.0.3", "version": "3.0.3",
"resolved": "https://registry.npmjs.org/write-file-atomic/-/write-file-atomic-3.0.3.tgz", "resolved": "https://registry.npmjs.org/write-file-atomic/-/write-file-atomic-3.0.3.tgz",


@@ -31,6 +31,9 @@
"react-dom": "^19.0.0",
"remark-code-import": "^1.2.0"
},
"overrides": {
"glob": "^10.5.0"
},
"browserslist": { "browserslist": {
"production": [ "production": [
">0.5%", ">0.5%",


@@ -6705,9 +6705,21 @@ components:
title: Object
default: vector_store.file
attributes:
-additionalProperties: true
+additionalProperties:
anyOf:
- type: string
maxLength: 512
- type: number
- type: boolean
title: string | number | boolean
propertyNames:
type: string
maxLength: 64
type: object
maxProperties: 16
title: Attributes
description: Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard. Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters, booleans, or numbers.
x-oaiTypeLabel: map
chunking_strategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'


@@ -6061,9 +6061,21 @@ components:
title: Object
default: vector_store.file
attributes:
-additionalProperties: true
+additionalProperties:
anyOf:
- type: string
maxLength: 512
- type: number
- type: boolean
title: string | number | boolean
propertyNames:
type: string
maxLength: 64
type: object
maxProperties: 16
title: Attributes
description: Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard. Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters, booleans, or numbers.
x-oaiTypeLabel: map
chunking_strategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'


@@ -8883,9 +8883,21 @@ components:
title: Object
default: vector_store.file
attributes:
-additionalProperties: true
+additionalProperties:
anyOf:
- type: string
maxLength: 512
- type: number
- type: boolean
title: string | number | boolean
propertyNames:
type: string
maxLength: 64
type: object
maxProperties: 16
title: Attributes
description: Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard. Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters, booleans, or numbers.
x-oaiTypeLabel: map
chunking_strategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'


@@ -9862,9 +9862,21 @@ components:
title: Object
default: vector_store.file
attributes:
-additionalProperties: true
+additionalProperties:
anyOf:
- type: string
maxLength: 512
- type: number
- type: boolean
title: string | number | boolean
propertyNames:
type: string
maxLength: 64
type: object
maxProperties: 16
title: Attributes
description: Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard. Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters, booleans, or numbers.
x-oaiTypeLabel: map
chunking_strategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'


@@ -287,9 +287,9 @@ start_container() {
# On macOS/Windows, use host.docker.internal to reach host from container
# On Linux with --network host, use localhost
if [[ "$(uname)" == "Darwin" ]] || [[ "$(uname)" == *"MINGW"* ]]; then
-OLLAMA_URL="${OLLAMA_URL:-http://host.docker.internal:11434}"
+OLLAMA_URL="${OLLAMA_URL:-http://host.docker.internal:11434/v1}"
else
-OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
+OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434/v1}"
fi
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OLLAMA_URL=$OLLAMA_URL"


@@ -16,16 +16,16 @@ import sys
from tests.integration.suites import SETUP_DEFINITIONS, SUITE_DEFINITIONS
-def get_setup_env_vars(setup_name, suite_name=None):
+def get_setup_config(setup_name, suite_name=None):
"""
-Get environment variables for a setup, with optional suite default fallback.
+Get full configuration (env vars + defaults) for a setup.
Args:
setup_name: Name of the setup (e.g., 'ollama', 'gpt')
suite_name: Optional suite name to get default setup if setup_name is None
Returns:
-Dictionary of environment variables
+Dictionary with 'env' and 'defaults' keys
"""
# If no setup specified, try to get default from suite
if not setup_name and suite_name:
@@ -34,7 +34,7 @@ def get_setup_env_vars(setup_name, suite_name=None):
setup_name = suite.default_setup
if not setup_name:
-return {}
+return {"env": {}, "defaults": {}}
setup = SETUP_DEFINITIONS.get(setup_name)
if not setup:
@@ -44,27 +44,31 @@
)
sys.exit(1)
-return setup.env
+return {"env": setup.env, "defaults": setup.defaults}
def main():
-parser = argparse.ArgumentParser(description="Extract environment variables from a test setup")
+parser = argparse.ArgumentParser(description="Extract environment variables and defaults from a test setup")
parser.add_argument("--setup", help="Setup name (e.g., ollama, gpt)")
parser.add_argument("--suite", help="Suite name to get default setup from if --setup not provided")
parser.add_argument("--format", choices=["bash", "json"], default="bash", help="Output format (default: bash)")
args = parser.parse_args()
-env_vars = get_setup_env_vars(args.setup, args.suite)
+config = get_setup_config(args.setup, args.suite)
if args.format == "bash":
-# Output as bash export statements
-for key, value in env_vars.items():
+# Output env vars as bash export statements
+for key, value in config["env"].items():
print(f"export {key}='{value}'")
# Output defaults as bash export statements with LLAMA_STACK_TEST_ prefix
for key, value in config["defaults"].items():
env_key = f"LLAMA_STACK_TEST_{key.upper()}"
print(f"export {env_key}='{value}'")
elif args.format == "json":
import json
-print(json.dumps(env_vars))
+print(json.dumps(config))
if __name__ == "__main__":
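With this change the helper returns both the setup's environment variables and its defaults, and bash mode exports the defaults with a `LLAMA_STACK_TEST_` prefix. A small sketch of the prefixing behavior on its own (the example default key and value are illustrative, not taken from a real setup):

```python
def defaults_to_exports(defaults: dict[str, str]) -> list[str]:
    """Mirror the new bash-mode output for setup defaults."""
    return [
        f"export LLAMA_STACK_TEST_{key.upper()}='{value}'"
        for key, value in defaults.items()
    ]


# e.g. {"text_model": "example-model-id"} -> ["export LLAMA_STACK_TEST_TEXT_MODEL='example-model-id'"]
```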


@@ -640,7 +640,7 @@ cmd=( run -d "${PLATFORM_OPTS[@]}" --name llama-stack \
--network llama-net \
-p "${PORT}:${PORT}" \
"${server_env_opts[@]}" \
--e OLLAMA_URL="http://ollama-server:${OLLAMA_PORT}" \
+-e OLLAMA_URL="http://ollama-server:${OLLAMA_PORT}/v1" \
"${SERVER_IMAGE}" --port "${PORT}")
log "🦙 Starting Llama Stack..."


@@ -20,6 +20,7 @@ TEST_PATTERN=""
INFERENCE_MODE="replay"
EXTRA_PARAMS=""
COLLECT_ONLY=false
TYPESCRIPT_ONLY=false
# Function to display usage
usage() {
@@ -34,6 +35,7 @@ Options:
--subdirs STRING Comma-separated list of test subdirectories to run (overrides suite)
--pattern STRING Regex pattern to pass to pytest -k
--collect-only Collect tests only without running them (skips server startup)
--typescript-only Skip Python tests and run only TypeScript client tests
--help Show this help message
Suites are defined in tests/integration/suites.py and define which tests to run.
@@ -90,6 +92,10 @@ while [[ $# -gt 0 ]]; do
COLLECT_ONLY=true
shift
;;
--typescript-only)
TYPESCRIPT_ONLY=true
shift
;;
--help)
usage
exit 0
@@ -181,6 +187,10 @@ echo "$SETUP_ENV"
eval "$SETUP_ENV"
echo ""
# Export suite and setup names for TypeScript tests
export LLAMA_STACK_TEST_SUITE="$TEST_SUITE"
export LLAMA_STACK_TEST_SETUP="$TEST_SETUP"
ROOT_DIR="$THIS_DIR/.."
cd $ROOT_DIR
@@ -212,6 +222,71 @@ find_available_port() {
return 1
}
run_client_ts_tests() {
if ! command -v npm &>/dev/null; then
echo "npm could not be found; ensure Node.js is installed"
return 1
fi
pushd tests/integration/client-typescript >/dev/null
# Determine if TS_CLIENT_PATH is a directory path or an npm version
if [[ -d "$TS_CLIENT_PATH" ]]; then
# It's a directory path - use local checkout
if [[ ! -f "$TS_CLIENT_PATH/package.json" ]]; then
echo "Error: $TS_CLIENT_PATH exists but doesn't look like llama-stack-client-typescript (no package.json)"
popd >/dev/null
return 1
fi
echo "Using local llama-stack-client-typescript from: $TS_CLIENT_PATH"
# Build the TypeScript client first
echo "Building TypeScript client..."
pushd "$TS_CLIENT_PATH" >/dev/null
npm install --silent
npm run build --silent
popd >/dev/null
# Install other dependencies first
if [[ "${CI:-}" == "true" || "${CI:-}" == "1" ]]; then
npm ci --silent
else
npm install --silent
fi
# Then install the client from local directory
echo "Installing llama-stack-client from: $TS_CLIENT_PATH"
npm install "$TS_CLIENT_PATH" --silent
else
# It's an npm version specifier - install from npm
echo "Installing llama-stack-client@${TS_CLIENT_PATH} from npm"
if [[ "${CI:-}" == "true" || "${CI:-}" == "1" ]]; then
npm ci --silent
npm install "llama-stack-client@${TS_CLIENT_PATH}" --silent
else
npm install "llama-stack-client@${TS_CLIENT_PATH}" --silent
fi
fi
# Verify installation
echo "Verifying llama-stack-client installation..."
if npm list llama-stack-client 2>/dev/null | grep -q llama-stack-client; then
echo "✅ llama-stack-client successfully installed"
npm list llama-stack-client
else
echo "❌ llama-stack-client not found in node_modules"
echo "Installed packages:"
npm list --depth=0
popd >/dev/null
return 1
fi
echo "Running TypeScript tests for suite $TEST_SUITE (setup $TEST_SETUP)"
npm test
popd >/dev/null
}
# Start Llama Stack Server if needed
if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
# Find an available port for the server
@@ -221,6 +296,7 @@ if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
exit 1
fi
export LLAMA_STACK_PORT
export TEST_API_BASE_URL="http://localhost:$LLAMA_STACK_PORT"
echo "Will use port: $LLAMA_STACK_PORT" echo "Will use port: $LLAMA_STACK_PORT"
stop_server() { stop_server() {
@@ -298,6 +374,7 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
exit 1
fi
export LLAMA_STACK_PORT
export TEST_API_BASE_URL="http://localhost:$LLAMA_STACK_PORT"
echo "Will use port: $LLAMA_STACK_PORT" echo "Will use port: $LLAMA_STACK_PORT"
echo "=== Building Docker Image for distribution: $DISTRO ===" echo "=== Building Docker Image for distribution: $DISTRO ==="
@@ -473,7 +550,9 @@ if [[ -n "$STACK_CONFIG" ]]; then
STACK_CONFIG_ARG="--stack-config=$STACK_CONFIG"
fi
-pytest -s -v $PYTEST_TARGET \
+# Run Python tests unless typescript-only mode
if [[ "$TYPESCRIPT_ONLY" == "false" ]]; then
pytest -s -v $PYTEST_TARGET \
$STACK_CONFIG_ARG \
--inference-mode="$INFERENCE_MODE" \
-k "$PYTEST_PATTERN" \
@@ -482,7 +561,12 @@ pytest -s -v $PYTEST_TARGET \
--embedding-model=sentence-transformers/nomic-ai/nomic-embed-text-v1.5 \
--color=yes $EXTRA_PARAMS \
--capture=tee-sys
exit_code=$?
else
echo "Skipping Python tests (--typescript-only mode)"
exit_code=0
fi
set +x
set -e
@@ -506,5 +590,10 @@ else
exit 1
fi
# Run TypeScript client tests if TS_CLIENT_PATH is set
if [[ $exit_code -eq 0 && -n "${TS_CLIENT_PATH:-}" && "${LLAMA_STACK_TEST_STACK_CONFIG_TYPE:-}" == "server" ]]; then
run_client_ts_tests
fi
echo "" echo ""
echo "=== Integration Tests Complete ===" echo "=== Integration Tests Complete ==="


@@ -17,32 +17,32 @@ providers:
- provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
provider_type: remote::cerebras
config:
-base_url: https://api.cerebras.ai
+base_url: https://api.cerebras.ai/v1
api_key: ${env.CEREBRAS_API_KEY:=}
- provider_id: ${env.OLLAMA_URL:+ollama}
provider_type: remote::ollama
config:
-url: ${env.OLLAMA_URL:=http://localhost:11434}
+base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
- provider_id: ${env.VLLM_URL:+vllm}
provider_type: remote::vllm
config:
-url: ${env.VLLM_URL:=}
+base_url: ${env.VLLM_URL:=}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: ${env.TGI_URL:+tgi}
provider_type: remote::tgi
config:
-url: ${env.TGI_URL:=}
+base_url: ${env.TGI_URL:=}
- provider_id: fireworks
provider_type: remote::fireworks
config:
-url: https://api.fireworks.ai/inference/v1
+base_url: https://api.fireworks.ai/inference/v1
api_key: ${env.FIREWORKS_API_KEY:=}
- provider_id: together
provider_type: remote::together
config:
-url: https://api.together.xyz/v1
+base_url: https://api.together.xyz/v1
api_key: ${env.TOGETHER_API_KEY:=}
- provider_id: bedrock
provider_type: remote::bedrock
@@ -52,9 +52,8 @@ providers:
- provider_id: ${env.NVIDIA_API_KEY:+nvidia}
provider_type: remote::nvidia
config:
-url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: openai
provider_type: remote::openai
config:
@@ -76,18 +75,18 @@ providers:
- provider_id: groq
provider_type: remote::groq
config:
-url: https://api.groq.com
+base_url: https://api.groq.com/openai/v1
api_key: ${env.GROQ_API_KEY:=}
- provider_id: sambanova
provider_type: remote::sambanova
config:
-url: https://api.sambanova.ai/v1
+base_url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY:=}
-api_base: ${env.AZURE_API_BASE:=}
+base_url: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers


@@ -17,32 +17,32 @@ providers:
- provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
provider_type: remote::cerebras
config:
-base_url: https://api.cerebras.ai
+base_url: https://api.cerebras.ai/v1
api_key: ${env.CEREBRAS_API_KEY:=}
- provider_id: ${env.OLLAMA_URL:+ollama}
provider_type: remote::ollama
config:
-url: ${env.OLLAMA_URL:=http://localhost:11434}
+base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
- provider_id: ${env.VLLM_URL:+vllm}
provider_type: remote::vllm
config:
-url: ${env.VLLM_URL:=}
+base_url: ${env.VLLM_URL:=}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: ${env.TGI_URL:+tgi}
provider_type: remote::tgi
config:
-url: ${env.TGI_URL:=}
+base_url: ${env.TGI_URL:=}
- provider_id: fireworks
provider_type: remote::fireworks
config:
-url: https://api.fireworks.ai/inference/v1
+base_url: https://api.fireworks.ai/inference/v1
api_key: ${env.FIREWORKS_API_KEY:=}
- provider_id: together
provider_type: remote::together
config:
-url: https://api.together.xyz/v1
+base_url: https://api.together.xyz/v1
api_key: ${env.TOGETHER_API_KEY:=}
- provider_id: bedrock
provider_type: remote::bedrock
@ -52,9 +52,8 @@ providers:
- provider_id: ${env.NVIDIA_API_KEY:+nvidia} - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
provider_type: remote::nvidia provider_type: remote::nvidia
config: config:
url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com} base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
api_key: ${env.NVIDIA_API_KEY:=} api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: openai - provider_id: openai
provider_type: remote::openai provider_type: remote::openai
config: config:
@ -76,18 +75,18 @@ providers:
- provider_id: groq - provider_id: groq
provider_type: remote::groq provider_type: remote::groq
config: config:
url: https://api.groq.com base_url: https://api.groq.com/openai/v1
api_key: ${env.GROQ_API_KEY:=} api_key: ${env.GROQ_API_KEY:=}
- provider_id: sambanova - provider_id: sambanova
provider_type: remote::sambanova provider_type: remote::sambanova
config: config:
url: https://api.sambanova.ai/v1 base_url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=} api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure} - provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure provider_type: remote::azure
config: config:
api_key: ${env.AZURE_API_KEY:=} api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=} base_url: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=} api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=} api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers - provider_id: sentence-transformers
View file
@ -16,9 +16,8 @@ providers:
- provider_id: nvidia - provider_id: nvidia
provider_type: remote::nvidia provider_type: remote::nvidia
config: config:
url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com} base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
api_key: ${env.NVIDIA_API_KEY:=} api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: nvidia - provider_id: nvidia
provider_type: remote::nvidia provider_type: remote::nvidia
config: config:
View file
@ -16,9 +16,8 @@ providers:
- provider_id: nvidia - provider_id: nvidia
provider_type: remote::nvidia provider_type: remote::nvidia
config: config:
url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com} base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
api_key: ${env.NVIDIA_API_KEY:=} api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
vector_io: vector_io:
- provider_id: faiss - provider_id: faiss
provider_type: inline::faiss provider_type: inline::faiss
View file
@ -27,12 +27,12 @@ providers:
- provider_id: groq - provider_id: groq
provider_type: remote::groq provider_type: remote::groq
config: config:
url: https://api.groq.com base_url: https://api.groq.com/openai/v1
api_key: ${env.GROQ_API_KEY:=} api_key: ${env.GROQ_API_KEY:=}
- provider_id: together - provider_id: together
provider_type: remote::together provider_type: remote::together
config: config:
url: https://api.together.xyz/v1 base_url: https://api.together.xyz/v1
api_key: ${env.TOGETHER_API_KEY:=} api_key: ${env.TOGETHER_API_KEY:=}
vector_io: vector_io:
- provider_id: sqlite-vec - provider_id: sqlite-vec
View file
@ -11,7 +11,7 @@ providers:
- provider_id: vllm-inference - provider_id: vllm-inference
provider_type: remote::vllm provider_type: remote::vllm
config: config:
url: ${env.VLLM_URL:=http://localhost:8000/v1} base_url: ${env.VLLM_URL:=}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096} max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake} api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true} tls_verify: ${env.VLLM_TLS_VERIFY:=true}
View file
@ -17,32 +17,32 @@ providers:
- provider_id: ${env.CEREBRAS_API_KEY:+cerebras} - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
provider_type: remote::cerebras provider_type: remote::cerebras
config: config:
base_url: https://api.cerebras.ai base_url: https://api.cerebras.ai/v1
api_key: ${env.CEREBRAS_API_KEY:=} api_key: ${env.CEREBRAS_API_KEY:=}
- provider_id: ${env.OLLAMA_URL:+ollama} - provider_id: ${env.OLLAMA_URL:+ollama}
provider_type: remote::ollama provider_type: remote::ollama
config: config:
url: ${env.OLLAMA_URL:=http://localhost:11434} base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
- provider_id: ${env.VLLM_URL:+vllm} - provider_id: ${env.VLLM_URL:+vllm}
provider_type: remote::vllm provider_type: remote::vllm
config: config:
url: ${env.VLLM_URL:=} base_url: ${env.VLLM_URL:=}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096} max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake} api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true} tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: ${env.TGI_URL:+tgi} - provider_id: ${env.TGI_URL:+tgi}
provider_type: remote::tgi provider_type: remote::tgi
config: config:
url: ${env.TGI_URL:=} base_url: ${env.TGI_URL:=}
- provider_id: fireworks - provider_id: fireworks
provider_type: remote::fireworks provider_type: remote::fireworks
config: config:
url: https://api.fireworks.ai/inference/v1 base_url: https://api.fireworks.ai/inference/v1
api_key: ${env.FIREWORKS_API_KEY:=} api_key: ${env.FIREWORKS_API_KEY:=}
- provider_id: together - provider_id: together
provider_type: remote::together provider_type: remote::together
config: config:
url: https://api.together.xyz/v1 base_url: https://api.together.xyz/v1
api_key: ${env.TOGETHER_API_KEY:=} api_key: ${env.TOGETHER_API_KEY:=}
- provider_id: bedrock - provider_id: bedrock
provider_type: remote::bedrock provider_type: remote::bedrock
@ -52,9 +52,8 @@ providers:
- provider_id: ${env.NVIDIA_API_KEY:+nvidia} - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
provider_type: remote::nvidia provider_type: remote::nvidia
config: config:
url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com} base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
api_key: ${env.NVIDIA_API_KEY:=} api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: openai - provider_id: openai
provider_type: remote::openai provider_type: remote::openai
config: config:
@ -76,18 +75,18 @@ providers:
- provider_id: groq - provider_id: groq
provider_type: remote::groq provider_type: remote::groq
config: config:
url: https://api.groq.com base_url: https://api.groq.com/openai/v1
api_key: ${env.GROQ_API_KEY:=} api_key: ${env.GROQ_API_KEY:=}
- provider_id: sambanova - provider_id: sambanova
provider_type: remote::sambanova provider_type: remote::sambanova
config: config:
url: https://api.sambanova.ai/v1 base_url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=} api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure} - provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure provider_type: remote::azure
config: config:
api_key: ${env.AZURE_API_KEY:=} api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=} base_url: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=} api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=} api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers - provider_id: sentence-transformers
View file
@ -17,32 +17,32 @@ providers:
- provider_id: ${env.CEREBRAS_API_KEY:+cerebras} - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
provider_type: remote::cerebras provider_type: remote::cerebras
config: config:
base_url: https://api.cerebras.ai base_url: https://api.cerebras.ai/v1
api_key: ${env.CEREBRAS_API_KEY:=} api_key: ${env.CEREBRAS_API_KEY:=}
- provider_id: ${env.OLLAMA_URL:+ollama} - provider_id: ${env.OLLAMA_URL:+ollama}
provider_type: remote::ollama provider_type: remote::ollama
config: config:
url: ${env.OLLAMA_URL:=http://localhost:11434} base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
- provider_id: ${env.VLLM_URL:+vllm} - provider_id: ${env.VLLM_URL:+vllm}
provider_type: remote::vllm provider_type: remote::vllm
config: config:
url: ${env.VLLM_URL:=} base_url: ${env.VLLM_URL:=}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096} max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake} api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true} tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: ${env.TGI_URL:+tgi} - provider_id: ${env.TGI_URL:+tgi}
provider_type: remote::tgi provider_type: remote::tgi
config: config:
url: ${env.TGI_URL:=} base_url: ${env.TGI_URL:=}
- provider_id: fireworks - provider_id: fireworks
provider_type: remote::fireworks provider_type: remote::fireworks
config: config:
url: https://api.fireworks.ai/inference/v1 base_url: https://api.fireworks.ai/inference/v1
api_key: ${env.FIREWORKS_API_KEY:=} api_key: ${env.FIREWORKS_API_KEY:=}
- provider_id: together - provider_id: together
provider_type: remote::together provider_type: remote::together
config: config:
url: https://api.together.xyz/v1 base_url: https://api.together.xyz/v1
api_key: ${env.TOGETHER_API_KEY:=} api_key: ${env.TOGETHER_API_KEY:=}
- provider_id: bedrock - provider_id: bedrock
provider_type: remote::bedrock provider_type: remote::bedrock
@ -52,9 +52,8 @@ providers:
- provider_id: ${env.NVIDIA_API_KEY:+nvidia} - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
provider_type: remote::nvidia provider_type: remote::nvidia
config: config:
url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com} base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
api_key: ${env.NVIDIA_API_KEY:=} api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: openai - provider_id: openai
provider_type: remote::openai provider_type: remote::openai
config: config:
@ -76,18 +75,18 @@ providers:
- provider_id: groq - provider_id: groq
provider_type: remote::groq provider_type: remote::groq
config: config:
url: https://api.groq.com base_url: https://api.groq.com/openai/v1
api_key: ${env.GROQ_API_KEY:=} api_key: ${env.GROQ_API_KEY:=}
- provider_id: sambanova - provider_id: sambanova
provider_type: remote::sambanova provider_type: remote::sambanova
config: config:
url: https://api.sambanova.ai/v1 base_url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=} api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure} - provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure provider_type: remote::azure
config: config:
api_key: ${env.AZURE_API_KEY:=} api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=} base_url: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=} api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=} api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers - provider_id: sentence-transformers
View file
@ -17,32 +17,32 @@ providers:
- provider_id: ${env.CEREBRAS_API_KEY:+cerebras} - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
provider_type: remote::cerebras provider_type: remote::cerebras
config: config:
base_url: https://api.cerebras.ai base_url: https://api.cerebras.ai/v1
api_key: ${env.CEREBRAS_API_KEY:=} api_key: ${env.CEREBRAS_API_KEY:=}
- provider_id: ${env.OLLAMA_URL:+ollama} - provider_id: ${env.OLLAMA_URL:+ollama}
provider_type: remote::ollama provider_type: remote::ollama
config: config:
url: ${env.OLLAMA_URL:=http://localhost:11434} base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
- provider_id: ${env.VLLM_URL:+vllm} - provider_id: ${env.VLLM_URL:+vllm}
provider_type: remote::vllm provider_type: remote::vllm
config: config:
url: ${env.VLLM_URL:=} base_url: ${env.VLLM_URL:=}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096} max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake} api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true} tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: ${env.TGI_URL:+tgi} - provider_id: ${env.TGI_URL:+tgi}
provider_type: remote::tgi provider_type: remote::tgi
config: config:
url: ${env.TGI_URL:=} base_url: ${env.TGI_URL:=}
- provider_id: fireworks - provider_id: fireworks
provider_type: remote::fireworks provider_type: remote::fireworks
config: config:
url: https://api.fireworks.ai/inference/v1 base_url: https://api.fireworks.ai/inference/v1
api_key: ${env.FIREWORKS_API_KEY:=} api_key: ${env.FIREWORKS_API_KEY:=}
- provider_id: together - provider_id: together
provider_type: remote::together provider_type: remote::together
config: config:
url: https://api.together.xyz/v1 base_url: https://api.together.xyz/v1
api_key: ${env.TOGETHER_API_KEY:=} api_key: ${env.TOGETHER_API_KEY:=}
- provider_id: bedrock - provider_id: bedrock
provider_type: remote::bedrock provider_type: remote::bedrock
@ -52,9 +52,8 @@ providers:
- provider_id: ${env.NVIDIA_API_KEY:+nvidia} - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
provider_type: remote::nvidia provider_type: remote::nvidia
config: config:
url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com} base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
api_key: ${env.NVIDIA_API_KEY:=} api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: openai - provider_id: openai
provider_type: remote::openai provider_type: remote::openai
config: config:
@ -76,18 +75,18 @@ providers:
- provider_id: groq - provider_id: groq
provider_type: remote::groq provider_type: remote::groq
config: config:
url: https://api.groq.com base_url: https://api.groq.com/openai/v1
api_key: ${env.GROQ_API_KEY:=} api_key: ${env.GROQ_API_KEY:=}
- provider_id: sambanova - provider_id: sambanova
provider_type: remote::sambanova provider_type: remote::sambanova
config: config:
url: https://api.sambanova.ai/v1 base_url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=} api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure} - provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure provider_type: remote::azure
config: config:
api_key: ${env.AZURE_API_KEY:=} api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=} base_url: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=} api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=} api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers - provider_id: sentence-transformers
View file
@ -17,32 +17,32 @@ providers:
- provider_id: ${env.CEREBRAS_API_KEY:+cerebras} - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
provider_type: remote::cerebras provider_type: remote::cerebras
config: config:
base_url: https://api.cerebras.ai base_url: https://api.cerebras.ai/v1
api_key: ${env.CEREBRAS_API_KEY:=} api_key: ${env.CEREBRAS_API_KEY:=}
- provider_id: ${env.OLLAMA_URL:+ollama} - provider_id: ${env.OLLAMA_URL:+ollama}
provider_type: remote::ollama provider_type: remote::ollama
config: config:
url: ${env.OLLAMA_URL:=http://localhost:11434} base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
- provider_id: ${env.VLLM_URL:+vllm} - provider_id: ${env.VLLM_URL:+vllm}
provider_type: remote::vllm provider_type: remote::vllm
config: config:
url: ${env.VLLM_URL:=} base_url: ${env.VLLM_URL:=}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096} max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake} api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true} tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: ${env.TGI_URL:+tgi} - provider_id: ${env.TGI_URL:+tgi}
provider_type: remote::tgi provider_type: remote::tgi
config: config:
url: ${env.TGI_URL:=} base_url: ${env.TGI_URL:=}
- provider_id: fireworks - provider_id: fireworks
provider_type: remote::fireworks provider_type: remote::fireworks
config: config:
url: https://api.fireworks.ai/inference/v1 base_url: https://api.fireworks.ai/inference/v1
api_key: ${env.FIREWORKS_API_KEY:=} api_key: ${env.FIREWORKS_API_KEY:=}
- provider_id: together - provider_id: together
provider_type: remote::together provider_type: remote::together
config: config:
url: https://api.together.xyz/v1 base_url: https://api.together.xyz/v1
api_key: ${env.TOGETHER_API_KEY:=} api_key: ${env.TOGETHER_API_KEY:=}
- provider_id: bedrock - provider_id: bedrock
provider_type: remote::bedrock provider_type: remote::bedrock
@ -52,9 +52,8 @@ providers:
- provider_id: ${env.NVIDIA_API_KEY:+nvidia} - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
provider_type: remote::nvidia provider_type: remote::nvidia
config: config:
url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com} base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
api_key: ${env.NVIDIA_API_KEY:=} api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: openai - provider_id: openai
provider_type: remote::openai provider_type: remote::openai
config: config:
@ -76,18 +75,18 @@ providers:
- provider_id: groq - provider_id: groq
provider_type: remote::groq provider_type: remote::groq
config: config:
url: https://api.groq.com base_url: https://api.groq.com/openai/v1
api_key: ${env.GROQ_API_KEY:=} api_key: ${env.GROQ_API_KEY:=}
- provider_id: sambanova - provider_id: sambanova
provider_type: remote::sambanova provider_type: remote::sambanova
config: config:
url: https://api.sambanova.ai/v1 base_url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=} api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure} - provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure provider_type: remote::azure
config: config:
api_key: ${env.AZURE_API_KEY:=} api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=} base_url: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=} api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=} api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers - provider_id: sentence-transformers
View file
@ -15,7 +15,7 @@ providers:
- provider_id: watsonx - provider_id: watsonx
provider_type: remote::watsonx provider_type: remote::watsonx
config: config:
url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com} base_url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
api_key: ${env.WATSONX_API_KEY:=} api_key: ${env.WATSONX_API_KEY:=}
project_id: ${env.WATSONX_PROJECT_ID:=} project_id: ${env.WATSONX_PROJECT_ID:=}
vector_io: vector_io:
View file
@ -23,12 +23,14 @@ async def get_provider_impl(
config, config,
deps[Api.inference], deps[Api.inference],
deps[Api.vector_io], deps[Api.vector_io],
deps[Api.safety], deps.get(Api.safety),
deps[Api.tool_runtime], deps[Api.tool_runtime],
deps[Api.tool_groups], deps[Api.tool_groups],
deps[Api.conversations], deps[Api.conversations],
policy, deps[Api.prompts],
deps[Api.files],
telemetry_enabled, telemetry_enabled,
policy,
) )
await impl.initialize() await impl.initialize()
return impl return impl
View file
@ -12,6 +12,7 @@ from llama_stack.providers.utils.responses.responses_store import ResponsesStore
from llama_stack_api import ( from llama_stack_api import (
Agents, Agents,
Conversations, Conversations,
Files,
Inference, Inference,
ListOpenAIResponseInputItem, ListOpenAIResponseInputItem,
ListOpenAIResponseObject, ListOpenAIResponseObject,
@ -22,6 +23,7 @@ from llama_stack_api import (
OpenAIResponsePrompt, OpenAIResponsePrompt,
OpenAIResponseText, OpenAIResponseText,
Order, Order,
Prompts,
ResponseGuardrail, ResponseGuardrail,
Safety, Safety,
ToolGroups, ToolGroups,
@ -41,10 +43,12 @@ class MetaReferenceAgentsImpl(Agents):
config: MetaReferenceAgentsImplConfig, config: MetaReferenceAgentsImplConfig,
inference_api: Inference, inference_api: Inference,
vector_io_api: VectorIO, vector_io_api: VectorIO,
safety_api: Safety, safety_api: Safety | None,
tool_runtime_api: ToolRuntime, tool_runtime_api: ToolRuntime,
tool_groups_api: ToolGroups, tool_groups_api: ToolGroups,
conversations_api: Conversations, conversations_api: Conversations,
prompts_api: Prompts,
files_api: Files,
policy: list[AccessRule], policy: list[AccessRule],
telemetry_enabled: bool = False, telemetry_enabled: bool = False,
): ):
@ -56,7 +60,8 @@ class MetaReferenceAgentsImpl(Agents):
self.tool_groups_api = tool_groups_api self.tool_groups_api = tool_groups_api
self.conversations_api = conversations_api self.conversations_api = conversations_api
self.telemetry_enabled = telemetry_enabled self.telemetry_enabled = telemetry_enabled
self.prompts_api = prompts_api
self.files_api = files_api
self.in_memory_store = InmemoryKVStoreImpl() self.in_memory_store = InmemoryKVStoreImpl()
self.openai_responses_impl: OpenAIResponsesImpl | None = None self.openai_responses_impl: OpenAIResponsesImpl | None = None
self.policy = policy self.policy = policy
@ -73,6 +78,8 @@ class MetaReferenceAgentsImpl(Agents):
vector_io_api=self.vector_io_api, vector_io_api=self.vector_io_api,
safety_api=self.safety_api, safety_api=self.safety_api,
conversations_api=self.conversations_api, conversations_api=self.conversations_api,
prompts_api=self.prompts_api,
files_api=self.files_api,
) )
async def shutdown(self) -> None: async def shutdown(self) -> None:
View file
@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import re
import time import time
import uuid import uuid
from collections.abc import AsyncIterator from collections.abc import AsyncIterator
@ -18,13 +19,17 @@ from llama_stack.providers.utils.responses.responses_store import (
from llama_stack_api import ( from llama_stack_api import (
ConversationItem, ConversationItem,
Conversations, Conversations,
Files,
Inference, Inference,
InvalidConversationIdError, InvalidConversationIdError,
ListOpenAIResponseInputItem, ListOpenAIResponseInputItem,
ListOpenAIResponseObject, ListOpenAIResponseObject,
OpenAIChatCompletionContentPartParam,
OpenAIDeleteResponseObject, OpenAIDeleteResponseObject,
OpenAIMessageParam, OpenAIMessageParam,
OpenAIResponseInput, OpenAIResponseInput,
OpenAIResponseInputMessageContentFile,
OpenAIResponseInputMessageContentImage,
OpenAIResponseInputMessageContentText, OpenAIResponseInputMessageContentText,
OpenAIResponseInputTool, OpenAIResponseInputTool,
OpenAIResponseMessage, OpenAIResponseMessage,
@ -34,7 +39,9 @@ from llama_stack_api import (
OpenAIResponseText, OpenAIResponseText,
OpenAIResponseTextFormat, OpenAIResponseTextFormat,
OpenAISystemMessageParam, OpenAISystemMessageParam,
OpenAIUserMessageParam,
Order, Order,
Prompts,
ResponseGuardrailSpec, ResponseGuardrailSpec,
Safety, Safety,
ToolGroups, ToolGroups,
@ -46,6 +53,7 @@ from .streaming import StreamingResponseOrchestrator
from .tool_executor import ToolExecutor from .tool_executor import ToolExecutor
from .types import ChatCompletionContext, ToolContext from .types import ChatCompletionContext, ToolContext
from .utils import ( from .utils import (
convert_response_content_to_chat_content,
convert_response_input_to_chat_messages, convert_response_input_to_chat_messages,
convert_response_text_to_chat_response_format, convert_response_text_to_chat_response_format,
extract_guardrail_ids, extract_guardrail_ids,
@ -67,8 +75,10 @@ class OpenAIResponsesImpl:
tool_runtime_api: ToolRuntime, tool_runtime_api: ToolRuntime,
responses_store: ResponsesStore, responses_store: ResponsesStore,
vector_io_api: VectorIO, # VectorIO vector_io_api: VectorIO, # VectorIO
safety_api: Safety, safety_api: Safety | None,
conversations_api: Conversations, conversations_api: Conversations,
prompts_api: Prompts,
files_api: Files,
): ):
self.inference_api = inference_api self.inference_api = inference_api
self.tool_groups_api = tool_groups_api self.tool_groups_api = tool_groups_api
@ -82,6 +92,8 @@ class OpenAIResponsesImpl:
tool_runtime_api=tool_runtime_api, tool_runtime_api=tool_runtime_api,
vector_io_api=vector_io_api, vector_io_api=vector_io_api,
) )
self.prompts_api = prompts_api
self.files_api = files_api
async def _prepend_previous_response( async def _prepend_previous_response(
self, self,
@ -122,11 +134,13 @@ class OpenAIResponsesImpl:
# Use stored messages directly and convert only new input # Use stored messages directly and convert only new input
message_adapter = TypeAdapter(list[OpenAIMessageParam]) message_adapter = TypeAdapter(list[OpenAIMessageParam])
messages = message_adapter.validate_python(previous_response.messages) messages = message_adapter.validate_python(previous_response.messages)
new_messages = await convert_response_input_to_chat_messages(input, previous_messages=messages) new_messages = await convert_response_input_to_chat_messages(
input, previous_messages=messages, files_api=self.files_api
)
messages.extend(new_messages) messages.extend(new_messages)
else: else:
# Backward compatibility: reconstruct from inputs # Backward compatibility: reconstruct from inputs
messages = await convert_response_input_to_chat_messages(all_input) messages = await convert_response_input_to_chat_messages(all_input, files_api=self.files_api)
tool_context.recover_tools_from_previous_response(previous_response) tool_context.recover_tools_from_previous_response(previous_response)
elif conversation is not None: elif conversation is not None:
@ -138,7 +152,7 @@ class OpenAIResponsesImpl:
all_input = input all_input = input
if not conversation_items.data: if not conversation_items.data:
# First turn - just convert the new input # First turn - just convert the new input
messages = await convert_response_input_to_chat_messages(input) messages = await convert_response_input_to_chat_messages(input, files_api=self.files_api)
else: else:
if not stored_messages: if not stored_messages:
all_input = conversation_items.data all_input = conversation_items.data
@ -154,14 +168,82 @@ class OpenAIResponsesImpl:
all_input = input all_input = input
messages = stored_messages or [] messages = stored_messages or []
new_messages = await convert_response_input_to_chat_messages(all_input, previous_messages=messages) new_messages = await convert_response_input_to_chat_messages(
all_input, previous_messages=messages, files_api=self.files_api
)
messages.extend(new_messages) messages.extend(new_messages)
else: else:
all_input = input all_input = input
messages = await convert_response_input_to_chat_messages(all_input) messages = await convert_response_input_to_chat_messages(all_input, files_api=self.files_api)
return all_input, messages, tool_context return all_input, messages, tool_context
async def _prepend_prompt(
self,
messages: list[OpenAIMessageParam],
openai_response_prompt: OpenAIResponsePrompt | None,
) -> None:
"""Prepend prompt template to messages, resolving text/image/file variables.
:param messages: List of OpenAIMessageParam objects
:param openai_response_prompt: (Optional) OpenAIResponsePrompt object with variables
:returns: string of utf-8 characters
"""
if not openai_response_prompt or not openai_response_prompt.id:
return
prompt_version = int(openai_response_prompt.version) if openai_response_prompt.version else None
cur_prompt = await self.prompts_api.get_prompt(openai_response_prompt.id, prompt_version)
if not cur_prompt or not cur_prompt.prompt:
return
cur_prompt_text = cur_prompt.prompt
cur_prompt_variables = cur_prompt.variables
if not openai_response_prompt.variables:
messages.insert(0, OpenAISystemMessageParam(content=cur_prompt_text))
return
# Validate that all provided variables exist in the prompt
for name in openai_response_prompt.variables.keys():
if name not in cur_prompt_variables:
raise ValueError(f"Variable {name} not found in prompt {openai_response_prompt.id}")
# Separate text and media variables
text_substitutions = {}
media_content_parts: list[OpenAIChatCompletionContentPartParam] = []
for name, value in openai_response_prompt.variables.items():
# Text variable found
if isinstance(value, OpenAIResponseInputMessageContentText):
text_substitutions[name] = value.text
# Media variable found
elif isinstance(value, OpenAIResponseInputMessageContentImage | OpenAIResponseInputMessageContentFile):
converted_parts = await convert_response_content_to_chat_content([value], files_api=self.files_api)
if isinstance(converted_parts, list):
media_content_parts.extend(converted_parts)
# Eg: {{product_photo}} becomes "[Image: product_photo]"
# This gives the model textual context about what media exists in the prompt
var_type = value.type.replace("input_", "").replace("_", " ").title()
text_substitutions[name] = f"[{var_type}: {name}]"
def replace_variable(match: re.Match[str]) -> str:
var_name = match.group(1).strip()
return str(text_substitutions.get(var_name, match.group(0)))
pattern = r"\{\{\s*(\w+)\s*\}\}"
processed_prompt_text = re.sub(pattern, replace_variable, cur_prompt_text)
# Insert system message with resolved text
messages.insert(0, OpenAISystemMessageParam(content=processed_prompt_text))
# If we have media, create a new user message because allows to ingest images and files
if media_content_parts:
messages.append(OpenAIUserMessageParam(content=media_content_parts))
async def get_openai_response( async def get_openai_response(
self, self,
response_id: str, response_id: str,
@ -273,6 +355,14 @@ class OpenAIResponsesImpl:
guardrail_ids = extract_guardrail_ids(guardrails) if guardrails else [] guardrail_ids = extract_guardrail_ids(guardrails) if guardrails else []
# Validate that Safety API is available if guardrails are requested
if guardrail_ids and self.safety_api is None:
raise ValueError(
"Cannot process guardrails: Safety API is not configured.\n\n"
"To use guardrails, ensure the Safety API is configured in your stack, or remove "
"the 'guardrails' parameter from your request."
)
if conversation is not None: if conversation is not None:
if previous_response_id is not None: if previous_response_id is not None:
raise ValueError( raise ValueError(
@ -289,6 +379,7 @@ class OpenAIResponsesImpl:
input=input, input=input,
conversation=conversation, conversation=conversation,
model=model, model=model,
prompt=prompt,
instructions=instructions, instructions=instructions,
previous_response_id=previous_response_id, previous_response_id=previous_response_id,
store=store, store=store,
@ -342,6 +433,7 @@ class OpenAIResponsesImpl:
instructions: str | None = None, instructions: str | None = None,
previous_response_id: str | None = None, previous_response_id: str | None = None,
conversation: str | None = None, conversation: str | None = None,
prompt: OpenAIResponsePrompt | None = None,
store: bool | None = True, store: bool | None = True,
temperature: float | None = None, temperature: float | None = None,
text: OpenAIResponseText | None = None, text: OpenAIResponseText | None = None,
@ -364,6 +456,9 @@ class OpenAIResponsesImpl:
if instructions: if instructions:
messages.insert(0, OpenAISystemMessageParam(content=instructions)) messages.insert(0, OpenAISystemMessageParam(content=instructions))
# Prepend reusable prompt (if provided)
await self._prepend_prompt(messages, prompt)
# Structured outputs # Structured outputs
response_format = await convert_response_text_to_chat_response_format(text) response_format = await convert_response_text_to_chat_response_format(text)
@ -386,6 +481,7 @@ class OpenAIResponsesImpl:
ctx=ctx, ctx=ctx,
response_id=response_id, response_id=response_id,
created_at=created_at, created_at=created_at,
prompt=prompt,
text=text, text=text,
max_infer_iters=max_infer_iters, max_infer_iters=max_infer_iters,
parallel_tool_calls=parallel_tool_calls, parallel_tool_calls=parallel_tool_calls,
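The new _prepend_prompt path above resolves a stored prompt template into a system message: {{name}} placeholders for text variables are substituted directly, while image/file variables become textual markers such as "[Image: product_photo]" and their media parts are appended as a separate user message. A standalone sketch of just the substitution step, using the same pattern as the diff (the prompt text and variables are made-up examples):

import re

# Illustrative inputs, not real prompt-store data.
prompt_text = "You are a support agent for {{product_name}}. Consult {{product_photo}} when asked."
text_substitutions = {
    "product_name": "Llama Stack",
    "product_photo": "[Image: product_photo]",  # media variables become textual markers
}

def replace_variable(match: re.Match[str]) -> str:
    var_name = match.group(1).strip()
    return str(text_substitutions.get(var_name, match.group(0)))

pattern = r"\{\{\s*(\w+)\s*\}\}"
print(re.sub(pattern, replace_variable, prompt_text))
# -> You are a support agent for Llama Stack. Consult [Image: product_photo] when asked.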
View file
@ -66,6 +66,8 @@ from llama_stack_api import (
OpenAIResponseUsage, OpenAIResponseUsage,
OpenAIResponseUsageInputTokensDetails, OpenAIResponseUsageInputTokensDetails,
OpenAIResponseUsageOutputTokensDetails, OpenAIResponseUsageOutputTokensDetails,
OpenAIToolMessageParam,
Safety,
WebSearchToolTypes, WebSearchToolTypes,
) )
@ -111,7 +113,7 @@ class StreamingResponseOrchestrator:
max_infer_iters: int, max_infer_iters: int,
tool_executor, # Will be the tool execution logic from the main class tool_executor, # Will be the tool execution logic from the main class
instructions: str | None, instructions: str | None,
safety_api, safety_api: Safety | None,
guardrail_ids: list[str] | None = None, guardrail_ids: list[str] | None = None,
prompt: OpenAIResponsePrompt | None = None, prompt: OpenAIResponsePrompt | None = None,
parallel_tool_calls: bool | None = None, parallel_tool_calls: bool | None = None,
@ -905,10 +907,16 @@ class StreamingResponseOrchestrator:
"""Coordinate execution of both function and non-function tool calls.""" """Coordinate execution of both function and non-function tool calls."""
# Execute non-function tool calls # Execute non-function tool calls
for tool_call in non_function_tool_calls: for tool_call in non_function_tool_calls:
# Check if total calls made to built-in and mcp tools exceed max_tool_calls # if total calls made to built-in and mcp tools exceed max_tool_calls
# then create a tool response message indicating the call was skipped
if self.max_tool_calls is not None and self.accumulated_builtin_tool_calls >= self.max_tool_calls: if self.max_tool_calls is not None and self.accumulated_builtin_tool_calls >= self.max_tool_calls:
logger.info(f"Ignoring built-in and mcp tool call since reached the limit of {self.max_tool_calls=}.") logger.info(f"Ignoring built-in and mcp tool call since reached the limit of {self.max_tool_calls=}.")
break skipped_call_message = OpenAIToolMessageParam(
content=f"Tool call skipped: maximum tool calls limit ({self.max_tool_calls}) reached.",
tool_call_id=tool_call.id,
)
next_turn_messages.append(skipped_call_message)
continue
# Find the item_id for this tool call # Find the item_id for this tool call
matching_item_id = None matching_item_id = None
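With this change, built-in and MCP tool calls beyond max_tool_calls are no longer silently dropped; each skipped call now gets an explicit tool message so the model sees why no result came back. A self-contained sketch of that capping behavior (the message class below is a stand-in for OpenAIToolMessageParam, and the call ids are invented):

from dataclasses import dataclass

@dataclass
class ToolMessage:                 # stand-in for OpenAIToolMessageParam
    content: str
    tool_call_id: str

def cap_tool_calls(call_ids: list[str], max_tool_calls: int | None) -> list[ToolMessage]:
    executed = 0
    messages: list[ToolMessage] = []
    for call_id in call_ids:
        if max_tool_calls is not None and executed >= max_tool_calls:
            messages.append(ToolMessage(
                content=f"Tool call skipped: maximum tool calls limit ({max_tool_calls}) reached.",
                tool_call_id=call_id,
            ))
            continue
        executed += 1              # the real orchestrator executes the tool call here
    return messages

print(cap_tool_calls(["call_1", "call_2", "call_3"], max_tool_calls=2))
# -> one skip message, for call_3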
View file
@ -5,11 +5,14 @@
# the root directory of this source tree. # the root directory of this source tree.
import asyncio import asyncio
import base64
import mimetypes
import re import re
import uuid import uuid
from collections.abc import Sequence from collections.abc import Sequence
from llama_stack_api import ( from llama_stack_api import (
Files,
OpenAIAssistantMessageParam, OpenAIAssistantMessageParam,
OpenAIChatCompletionContentPartImageParam, OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartParam, OpenAIChatCompletionContentPartParam,
@ -18,6 +21,8 @@ from llama_stack_api import (
OpenAIChatCompletionToolCallFunction, OpenAIChatCompletionToolCallFunction,
OpenAIChoice, OpenAIChoice,
OpenAIDeveloperMessageParam, OpenAIDeveloperMessageParam,
OpenAIFile,
OpenAIFileFile,
OpenAIImageURL, OpenAIImageURL,
OpenAIJSONSchema, OpenAIJSONSchema,
OpenAIMessageParam, OpenAIMessageParam,
@ -29,6 +34,7 @@ from llama_stack_api import (
OpenAIResponseInput, OpenAIResponseInput,
OpenAIResponseInputFunctionToolCallOutput, OpenAIResponseInputFunctionToolCallOutput,
OpenAIResponseInputMessageContent, OpenAIResponseInputMessageContent,
OpenAIResponseInputMessageContentFile,
OpenAIResponseInputMessageContentImage, OpenAIResponseInputMessageContentImage,
OpenAIResponseInputMessageContentText, OpenAIResponseInputMessageContentText,
OpenAIResponseInputTool, OpenAIResponseInputTool,
@ -37,9 +43,11 @@ from llama_stack_api import (
OpenAIResponseMessage, OpenAIResponseMessage,
OpenAIResponseOutputMessageContent, OpenAIResponseOutputMessageContent,
OpenAIResponseOutputMessageContentOutputText, OpenAIResponseOutputMessageContentOutputText,
OpenAIResponseOutputMessageFileSearchToolCall,
OpenAIResponseOutputMessageFunctionToolCall, OpenAIResponseOutputMessageFunctionToolCall,
OpenAIResponseOutputMessageMCPCall, OpenAIResponseOutputMessageMCPCall,
OpenAIResponseOutputMessageMCPListTools, OpenAIResponseOutputMessageMCPListTools,
OpenAIResponseOutputMessageWebSearchToolCall,
OpenAIResponseText, OpenAIResponseText,
OpenAISystemMessageParam, OpenAISystemMessageParam,
OpenAIToolMessageParam, OpenAIToolMessageParam,
@ -49,6 +57,46 @@ from llama_stack_api import (
) )
async def extract_bytes_from_file(file_id: str, files_api: Files) -> bytes:
"""
Extract raw bytes from file using the Files API.
:param file_id: The file identifier (e.g., "file-abc123")
:param files_api: Files API instance
:returns: Raw file content as bytes
:raises: ValueError if file cannot be retrieved
"""
try:
response = await files_api.openai_retrieve_file_content(file_id)
return bytes(response.body)
except Exception as e:
raise ValueError(f"Failed to retrieve file content for file_id '{file_id}': {str(e)}") from e
def generate_base64_ascii_text_from_bytes(raw_bytes: bytes) -> str:
"""
Converts raw binary bytes into a safe ASCII text representation for URLs
:param raw_bytes: the actual bytes that represents file content
:returns: string of utf-8 characters
"""
return base64.b64encode(raw_bytes).decode("utf-8")
def construct_data_url(ascii_text: str, mime_type: str | None) -> str:
"""
Construct data url with decoded data inside
:param ascii_text: ASCII content
:param mime_type: MIME type of file
:returns: data url string (eg. data:image/png,base64,%3Ch1%3EHello%2C%20World%21%3C%2Fh1%3E)
"""
if not mime_type:
mime_type = "application/octet-stream"
return f"data:{mime_type};base64,{ascii_text}"
async def convert_chat_choice_to_response_message( async def convert_chat_choice_to_response_message(
choice: OpenAIChoice, choice: OpenAIChoice,
citation_files: dict[str, str] | None = None, citation_files: dict[str, str] | None = None,
@ -78,11 +126,15 @@ async def convert_chat_choice_to_response_message(
async def convert_response_content_to_chat_content( async def convert_response_content_to_chat_content(
content: str | Sequence[OpenAIResponseInputMessageContent | OpenAIResponseOutputMessageContent], content: str | Sequence[OpenAIResponseInputMessageContent | OpenAIResponseOutputMessageContent],
files_api: Files | None,
) -> str | list[OpenAIChatCompletionContentPartParam]: ) -> str | list[OpenAIChatCompletionContentPartParam]:
""" """
Convert the content parts from an OpenAI Response API request into OpenAI Chat Completion content parts. Convert the content parts from an OpenAI Response API request into OpenAI Chat Completion content parts.
The content schemas of each API look similar, but are not exactly the same. The content schemas of each API look similar, but are not exactly the same.
:param content: The content to convert
:param files_api: Files API for resolving file_id to raw file content (required if content contains files/images)
""" """
if isinstance(content, str): if isinstance(content, str):
return content return content
@ -95,9 +147,68 @@ async def convert_response_content_to_chat_content(
elif isinstance(content_part, OpenAIResponseOutputMessageContentOutputText): elif isinstance(content_part, OpenAIResponseOutputMessageContentOutputText):
converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part.text)) converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part.text))
elif isinstance(content_part, OpenAIResponseInputMessageContentImage): elif isinstance(content_part, OpenAIResponseInputMessageContentImage):
detail = content_part.detail
image_mime_type = None
if content_part.image_url: if content_part.image_url:
image_url = OpenAIImageURL(url=content_part.image_url, detail=content_part.detail) image_url = OpenAIImageURL(url=content_part.image_url, detail=detail)
converted_parts.append(OpenAIChatCompletionContentPartImageParam(image_url=image_url)) converted_parts.append(OpenAIChatCompletionContentPartImageParam(image_url=image_url))
elif content_part.file_id:
if files_api is None:
raise ValueError("file_ids are not supported by this implementation of the Stack")
image_file_response = await files_api.openai_retrieve_file(content_part.file_id)
if image_file_response.filename:
image_mime_type, _ = mimetypes.guess_type(image_file_response.filename)
raw_image_bytes = await extract_bytes_from_file(content_part.file_id, files_api)
ascii_text = generate_base64_ascii_text_from_bytes(raw_image_bytes)
image_data_url = construct_data_url(ascii_text, image_mime_type)
image_url = OpenAIImageURL(url=image_data_url, detail=detail)
converted_parts.append(OpenAIChatCompletionContentPartImageParam(image_url=image_url))
else:
raise ValueError(
f"Image content must have either 'image_url' or 'file_id'. "
f"Got image_url={content_part.image_url}, file_id={content_part.file_id}"
)
elif isinstance(content_part, OpenAIResponseInputMessageContentFile):
resolved_file_data = None
file_data = content_part.file_data
file_id = content_part.file_id
file_url = content_part.file_url
filename = content_part.filename
file_mime_type = None
if not any([file_data, file_id, file_url]):
raise ValueError(
f"File content must have at least one of 'file_data', 'file_id', or 'file_url'. "
f"Got file_data={file_data}, file_id={file_id}, file_url={file_url}"
)
if file_id:
if files_api is None:
raise ValueError("file_ids are not supported by this implementation of the Stack")
file_response = await files_api.openai_retrieve_file(file_id)
if not filename:
filename = file_response.filename
file_mime_type, _ = mimetypes.guess_type(file_response.filename)
raw_file_bytes = await extract_bytes_from_file(file_id, files_api)
ascii_text = generate_base64_ascii_text_from_bytes(raw_file_bytes)
resolved_file_data = construct_data_url(ascii_text, file_mime_type)
elif file_data:
if file_data.startswith("data:"):
resolved_file_data = file_data
else:
# Raw base64 data, wrap in data URL format
if filename:
file_mime_type, _ = mimetypes.guess_type(filename)
resolved_file_data = construct_data_url(file_data, file_mime_type)
elif file_url:
resolved_file_data = file_url
converted_parts.append(
OpenAIFile(
file=OpenAIFileFile(
file_data=resolved_file_data,
filename=filename,
)
)
)
elif isinstance(content_part, str): elif isinstance(content_part, str):
converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part)) converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part))
else: else:
@ -110,12 +221,14 @@ async def convert_response_content_to_chat_content(
async def convert_response_input_to_chat_messages( async def convert_response_input_to_chat_messages(
input: str | list[OpenAIResponseInput], input: str | list[OpenAIResponseInput],
previous_messages: list[OpenAIMessageParam] | None = None, previous_messages: list[OpenAIMessageParam] | None = None,
files_api: Files | None = None,
) -> list[OpenAIMessageParam]: ) -> list[OpenAIMessageParam]:
""" """
Convert the input from an OpenAI Response API request into OpenAI Chat Completion messages. Convert the input from an OpenAI Response API request into OpenAI Chat Completion messages.
:param input: The input to convert :param input: The input to convert
:param previous_messages: Optional previous messages to check for function_call references :param previous_messages: Optional previous messages to check for function_call references
:param files_api: Files API for resolving file_id to raw file content (optional, required for file/image content)
""" """
messages: list[OpenAIMessageParam] = [] messages: list[OpenAIMessageParam] = []
if isinstance(input, list): if isinstance(input, list):
@ -169,6 +282,12 @@ async def convert_response_input_to_chat_messages(
elif isinstance(input_item, OpenAIResponseOutputMessageMCPListTools): elif isinstance(input_item, OpenAIResponseOutputMessageMCPListTools):
# the tool list will be handled separately # the tool list will be handled separately
pass pass
elif isinstance(
input_item,
OpenAIResponseOutputMessageWebSearchToolCall | OpenAIResponseOutputMessageFileSearchToolCall,
):
# these tool calls are tracked internally but not converted to chat messages
pass
elif isinstance(input_item, OpenAIResponseMCPApprovalRequest) or isinstance( elif isinstance(input_item, OpenAIResponseMCPApprovalRequest) or isinstance(
input_item, OpenAIResponseMCPApprovalResponse input_item, OpenAIResponseMCPApprovalResponse
): ):
@ -176,7 +295,7 @@ async def convert_response_input_to_chat_messages(
pass pass
elif isinstance(input_item, OpenAIResponseMessage): elif isinstance(input_item, OpenAIResponseMessage):
# Narrow type to OpenAIResponseMessage which has content and role attributes # Narrow type to OpenAIResponseMessage which has content and role attributes
content = await convert_response_content_to_chat_content(input_item.content) content = await convert_response_content_to_chat_content(input_item.content, files_api)
message_type = await get_message_type_by_role(input_item.role) message_type = await get_message_type_by_role(input_item.role)
if message_type is None: if message_type is None:
raise ValueError( raise ValueError(
@ -320,11 +439,15 @@ def is_function_tool_call(
return False return False
async def run_guardrails(safety_api: Safety, messages: str, guardrail_ids: list[str]) -> str | None: async def run_guardrails(safety_api: Safety | None, messages: str, guardrail_ids: list[str]) -> str | None:
"""Run guardrails against messages and return violation message if blocked.""" """Run guardrails against messages and return violation message if blocked."""
if not messages: if not messages:
return None return None
# If safety API is not available, skip guardrails
if safety_api is None:
return None
# Look up shields to get their provider_resource_id (actual model ID) # Look up shields to get their provider_resource_id (actual model ID)
model_ids = [] model_ids = []
# TODO: list_shields not in Safety interface but available at runtime via API routing # TODO: list_shields not in Safety interface but available at runtime via API routing
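The helpers added above (extract_bytes_from_file, generate_base64_ascii_text_from_bytes, construct_data_url) turn a file_id into an inline base64 data URL so images and files can be forwarded to chat completions. A minimal sketch of the encoding path with inline bytes instead of the Files API (filename and content are examples):

import base64
import mimetypes

def to_data_url(raw_bytes: bytes, filename: str | None) -> str:
    # Mirrors generate_base64_ascii_text_from_bytes + construct_data_url above.
    mime_type = mimetypes.guess_type(filename)[0] if filename else None
    ascii_text = base64.b64encode(raw_bytes).decode("utf-8")
    return f"data:{mime_type or 'application/octet-stream'};base64,{ascii_text}"

print(to_data_url(b"hello world", "notes.txt"))
# -> data:text/plain;base64,aGVsbG8gd29ybGQ=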
View file
@ -30,11 +30,15 @@ def available_providers() -> list[ProviderSpec]:
config_class="llama_stack.providers.inline.agents.meta_reference.MetaReferenceAgentsImplConfig", config_class="llama_stack.providers.inline.agents.meta_reference.MetaReferenceAgentsImplConfig",
api_dependencies=[ api_dependencies=[
Api.inference, Api.inference,
Api.safety,
Api.vector_io, Api.vector_io,
Api.tool_runtime, Api.tool_runtime,
Api.tool_groups, Api.tool_groups,
Api.conversations, Api.conversations,
Api.prompts,
Api.files,
],
optional_api_dependencies=[
Api.safety,
], ],
description="Meta's reference implementation of an agent system that can use tools, access vector databases, and perform complex reasoning tasks.", description="Meta's reference implementation of an agent system that can use tools, access vector databases, and perform complex reasoning tasks.",
), ),
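Moving Api.safety from api_dependencies to optional_api_dependencies means the agents provider is constructed even when no Safety provider is configured; the wiring code then uses deps.get(...) and tolerates None. A tiny illustrative sketch of that lookup difference (the enum and dict below are stand-ins, not stack code):

from enum import Enum

class Api(Enum):                       # stand-in for the real Api enum
    inference = "inference"
    safety = "safety"

deps = {Api.inference: object()}       # run config without a Safety provider

safety_api = deps.get(Api.safety)      # optional dependency -> None, guardrails unavailable
inference_api = deps[Api.inference]    # required dependency -> KeyError if missing
print(safety_api is None)              # -> True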
View file
@ -4,8 +4,6 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from urllib.parse import urljoin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import AzureConfig from .config import AzureConfig
@ -22,4 +20,4 @@ class AzureInferenceAdapter(OpenAIMixin):
Returns the Azure API base URL from the configuration. Returns the Azure API base URL from the configuration.
""" """
return urljoin(str(self.config.api_base), "/openai/v1") return str(self.config.base_url)
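Previously the Azure adapter appended /openai/v1 to api_base with urljoin; after this change base_url is expected to already contain the full path and is returned unchanged. A quick sketch contrasting the two (the resource name is an example):

from urllib.parse import urljoin

# Old behaviour: a bare resource URL had "/openai/v1" joined onto it.
api_base = "https://your-resource-name.openai.azure.com"   # example value
print(urljoin(api_base, "/openai/v1"))
# -> https://your-resource-name.openai.azure.com/openai/v1
# New behaviour: AZURE_API_BASE should already be the full .../openai/v1 URL,
# and get_base_url() simply returns str(self.config.base_url).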
View file
@ -32,8 +32,9 @@ class AzureProviderDataValidator(BaseModel):
@json_schema_type @json_schema_type
class AzureConfig(RemoteInferenceProviderConfig): class AzureConfig(RemoteInferenceProviderConfig):
api_base: HttpUrl = Field( base_url: HttpUrl | None = Field(
description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com)", default=None,
description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com/openai/v1)",
) )
api_version: str | None = Field( api_version: str | None = Field(
default_factory=lambda: os.getenv("AZURE_API_VERSION"), default_factory=lambda: os.getenv("AZURE_API_VERSION"),
@ -48,14 +49,14 @@ class AzureConfig(RemoteInferenceProviderConfig):
def sample_run_config( def sample_run_config(
cls, cls,
api_key: str = "${env.AZURE_API_KEY:=}", api_key: str = "${env.AZURE_API_KEY:=}",
api_base: str = "${env.AZURE_API_BASE:=}", base_url: str = "${env.AZURE_API_BASE:=}",
api_version: str = "${env.AZURE_API_VERSION:=}", api_version: str = "${env.AZURE_API_VERSION:=}",
api_type: str = "${env.AZURE_API_TYPE:=}", api_type: str = "${env.AZURE_API_TYPE:=}",
**kwargs, **kwargs,
) -> dict[str, Any]: ) -> dict[str, Any]:
return { return {
"api_key": api_key, "api_key": api_key,
"api_base": api_base, "base_url": base_url,
"api_version": api_version, "api_version": api_version,
"api_type": api_type, "api_type": api_type,
} }
View file
@ -4,8 +4,6 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from urllib.parse import urljoin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack_api import ( from llama_stack_api import (
OpenAIEmbeddingsRequestWithExtraBody, OpenAIEmbeddingsRequestWithExtraBody,
@ -21,7 +19,7 @@ class CerebrasInferenceAdapter(OpenAIMixin):
provider_data_api_key_field: str = "cerebras_api_key" provider_data_api_key_field: str = "cerebras_api_key"
def get_base_url(self) -> str: def get_base_url(self) -> str:
return urljoin(self.config.base_url, "v1") return str(self.config.base_url)
async def openai_embeddings( async def openai_embeddings(
self, self,
View file
@ -7,12 +7,12 @@
import os import os
from typing import Any from typing import Any
from pydantic import BaseModel, Field from pydantic import BaseModel, Field, HttpUrl
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack_api import json_schema_type from llama_stack_api import json_schema_type
DEFAULT_BASE_URL = "https://api.cerebras.ai" DEFAULT_BASE_URL = "https://api.cerebras.ai/v1"
class CerebrasProviderDataValidator(BaseModel): class CerebrasProviderDataValidator(BaseModel):
@ -24,8 +24,8 @@ class CerebrasProviderDataValidator(BaseModel):
@json_schema_type @json_schema_type
class CerebrasImplConfig(RemoteInferenceProviderConfig): class CerebrasImplConfig(RemoteInferenceProviderConfig):
base_url: str = Field( base_url: HttpUrl | None = Field(
default=os.environ.get("CEREBRAS_BASE_URL", DEFAULT_BASE_URL), default=HttpUrl(os.environ.get("CEREBRAS_BASE_URL", DEFAULT_BASE_URL)),
description="Base URL for the Cerebras API", description="Base URL for the Cerebras API",
) )
View file
@ -6,7 +6,7 @@
from typing import Any from typing import Any
from pydantic import BaseModel, Field, SecretStr from pydantic import BaseModel, Field, HttpUrl, SecretStr
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack_api import json_schema_type from llama_stack_api import json_schema_type
@ -21,9 +21,9 @@ class DatabricksProviderDataValidator(BaseModel):
@json_schema_type @json_schema_type
class DatabricksImplConfig(RemoteInferenceProviderConfig): class DatabricksImplConfig(RemoteInferenceProviderConfig):
url: str | None = Field( base_url: HttpUrl | None = Field(
default=None, default=None,
description="The URL for the Databricks model serving endpoint", description="The URL for the Databricks model serving endpoint (should include /serving-endpoints path)",
) )
auth_credential: SecretStr | None = Field( auth_credential: SecretStr | None = Field(
default=None, default=None,
@ -34,11 +34,11 @@ class DatabricksImplConfig(RemoteInferenceProviderConfig):
@classmethod @classmethod
def sample_run_config( def sample_run_config(
cls, cls,
url: str = "${env.DATABRICKS_HOST:=}", base_url: str = "${env.DATABRICKS_HOST:=}",
api_token: str = "${env.DATABRICKS_TOKEN:=}", api_token: str = "${env.DATABRICKS_TOKEN:=}",
**kwargs: Any, **kwargs: Any,
) -> dict[str, Any]: ) -> dict[str, Any]:
return { return {
"url": url, "base_url": base_url,
"api_token": api_token, "api_token": api_token,
} }
View file
@ -29,15 +29,21 @@ class DatabricksInferenceAdapter(OpenAIMixin):
} }
def get_base_url(self) -> str: def get_base_url(self) -> str:
return f"{self.config.url}/serving-endpoints" return str(self.config.base_url)
async def list_provider_model_ids(self) -> Iterable[str]: async def list_provider_model_ids(self) -> Iterable[str]:
# Filter out None values from endpoint names # Filter out None values from endpoint names
api_token = self._get_api_key_from_config_or_provider_data() api_token = self._get_api_key_from_config_or_provider_data()
# WorkspaceClient expects base host without /serving-endpoints suffix
base_url_str = str(self.config.base_url)
if base_url_str.endswith("/serving-endpoints"):
host = base_url_str[:-18] # Remove '/serving-endpoints'
else:
host = base_url_str
return [ return [
endpoint.name # type: ignore[misc] endpoint.name # type: ignore[misc]
for endpoint in WorkspaceClient( for endpoint in WorkspaceClient(
host=self.config.url, token=api_token host=host, token=api_token
).serving_endpoints.list() # TODO: this is not async ).serving_endpoints.list() # TODO: this is not async
] ]
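list_provider_model_ids above trims a trailing /serving-endpoints from base_url (by slicing off its 18 characters) because WorkspaceClient expects the bare workspace host. An equivalent sketch using str.removesuffix, with an example host:

# Equivalent, slightly more explicit form of the suffix stripping above.
base_url_str = "https://dbc-1234.cloud.databricks.com/serving-endpoints"   # example value
host = base_url_str.removesuffix("/serving-endpoints")
print(host)
# -> https://dbc-1234.cloud.databricks.com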
View file
@ -6,7 +6,7 @@
from typing import Any from typing import Any
from pydantic import Field from pydantic import Field, HttpUrl
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack_api import json_schema_type from llama_stack_api import json_schema_type
@ -14,14 +14,14 @@ from llama_stack_api import json_schema_type
@json_schema_type @json_schema_type
class FireworksImplConfig(RemoteInferenceProviderConfig): class FireworksImplConfig(RemoteInferenceProviderConfig):
url: str = Field( base_url: HttpUrl | None = Field(
default="https://api.fireworks.ai/inference/v1", default=HttpUrl("https://api.fireworks.ai/inference/v1"),
description="The URL for the Fireworks server", description="The URL for the Fireworks server",
) )
@classmethod @classmethod
def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY:=}", **kwargs) -> dict[str, Any]: def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY:=}", **kwargs) -> dict[str, Any]:
return { return {
"url": "https://api.fireworks.ai/inference/v1", "base_url": "https://api.fireworks.ai/inference/v1",
"api_key": api_key, "api_key": api_key,
} }
View file
@ -24,4 +24,4 @@ class FireworksInferenceAdapter(OpenAIMixin):
provider_data_api_key_field: str = "fireworks_api_key" provider_data_api_key_field: str = "fireworks_api_key"
def get_base_url(self) -> str: def get_base_url(self) -> str:
return "https://api.fireworks.ai/inference/v1" return str(self.config.base_url)
View file
@ -6,7 +6,7 @@
from typing import Any from typing import Any
from pydantic import BaseModel, Field from pydantic import BaseModel, Field, HttpUrl
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack_api import json_schema_type from llama_stack_api import json_schema_type
@ -21,14 +21,14 @@ class GroqProviderDataValidator(BaseModel):
@json_schema_type @json_schema_type
class GroqConfig(RemoteInferenceProviderConfig): class GroqConfig(RemoteInferenceProviderConfig):
url: str = Field( base_url: HttpUrl | None = Field(
default="https://api.groq.com", default=HttpUrl("https://api.groq.com/openai/v1"),
description="The URL for the Groq AI server", description="The URL for the Groq AI server",
) )
@classmethod @classmethod
def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY:=}", **kwargs) -> dict[str, Any]: def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY:=}", **kwargs) -> dict[str, Any]:
return { return {
"url": "https://api.groq.com", "base_url": "https://api.groq.com/openai/v1",
"api_key": api_key, "api_key": api_key,
} }
View file
@ -15,4 +15,4 @@ class GroqInferenceAdapter(OpenAIMixin):
provider_data_api_key_field: str = "groq_api_key" provider_data_api_key_field: str = "groq_api_key"
def get_base_url(self) -> str: def get_base_url(self) -> str:
return f"{self.config.url}/openai/v1" return str(self.config.base_url)
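Across these provider configs (Fireworks and Groq above, and the ones that follow) the change follows one pattern: the config stores the full OpenAI-compatible `base_url` as a pydantic `HttpUrl`, and the adapter stringifies it instead of appending path segments itself. A rough, self-contained sketch (the class name is illustrative, not a real provider config):

```python
from pydantic import BaseModel, Field, HttpUrl


class ExampleRemoteConfig(BaseModel):
    base_url: HttpUrl | None = Field(
        default=HttpUrl("https://api.groq.com/openai/v1"),
        description="The URL for the remote OpenAI-compatible server",
    )


def get_base_url(config: ExampleRemoteConfig) -> str:
    # No more f"{url}/openai/v1" in the adapter; the path now lives in the config.
    return str(config.base_url)


print(get_base_url(ExampleRemoteConfig()))  # -> https://api.groq.com/openai/v1
```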


@ -6,7 +6,7 @@
from typing import Any from typing import Any
from pydantic import BaseModel, Field from pydantic import BaseModel, Field, HttpUrl
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack_api import json_schema_type from llama_stack_api import json_schema_type
@ -21,14 +21,14 @@ class LlamaProviderDataValidator(BaseModel):
@json_schema_type @json_schema_type
class LlamaCompatConfig(RemoteInferenceProviderConfig): class LlamaCompatConfig(RemoteInferenceProviderConfig):
openai_compat_api_base: str = Field( base_url: HttpUrl | None = Field(
default="https://api.llama.com/compat/v1/", default=HttpUrl("https://api.llama.com/compat/v1/"),
description="The URL for the Llama API server", description="The URL for the Llama API server",
) )
@classmethod @classmethod
def sample_run_config(cls, api_key: str = "${env.LLAMA_API_KEY}", **kwargs) -> dict[str, Any]: def sample_run_config(cls, api_key: str = "${env.LLAMA_API_KEY}", **kwargs) -> dict[str, Any]:
return { return {
"openai_compat_api_base": "https://api.llama.com/compat/v1/", "base_url": "https://api.llama.com/compat/v1/",
"api_key": api_key, "api_key": api_key,
} }


@ -31,7 +31,7 @@ class LlamaCompatInferenceAdapter(OpenAIMixin):
:return: The Llama API base URL :return: The Llama API base URL
""" """
return self.config.openai_compat_api_base return str(self.config.base_url)
async def openai_completion( async def openai_completion(
self, self,


@ -7,7 +7,7 @@
import os import os
from typing import Any from typing import Any
from pydantic import BaseModel, Field from pydantic import BaseModel, Field, HttpUrl
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack_api import json_schema_type from llama_stack_api import json_schema_type
@ -44,18 +44,14 @@ class NVIDIAConfig(RemoteInferenceProviderConfig):
URL of your running NVIDIA NIM and do not need to set the api_key. URL of your running NVIDIA NIM and do not need to set the api_key.
""" """
url: str = Field( base_url: HttpUrl | None = Field(
default_factory=lambda: os.getenv("NVIDIA_BASE_URL", "https://integrate.api.nvidia.com"), default_factory=lambda: os.getenv("NVIDIA_BASE_URL", "https://integrate.api.nvidia.com/v1"),
description="A base url for accessing the NVIDIA NIM", description="A base url for accessing the NVIDIA NIM",
) )
timeout: int = Field( timeout: int = Field(
default=60, default=60,
description="Timeout for the HTTP requests", description="Timeout for the HTTP requests",
) )
append_api_version: bool = Field(
default_factory=lambda: os.getenv("NVIDIA_APPEND_API_VERSION", "True").lower() != "false",
description="When set to false, the API version will not be appended to the base_url. By default, it is true.",
)
rerank_model_to_url: dict[str, str] = Field( rerank_model_to_url: dict[str, str] = Field(
default_factory=lambda: { default_factory=lambda: {
"nv-rerank-qa-mistral-4b:1": "https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking", "nv-rerank-qa-mistral-4b:1": "https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking",
@ -68,13 +64,11 @@ class NVIDIAConfig(RemoteInferenceProviderConfig):
@classmethod @classmethod
def sample_run_config( def sample_run_config(
cls, cls,
url: str = "${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}", base_url: HttpUrl | None = "${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}",
api_key: str = "${env.NVIDIA_API_KEY:=}", api_key: str = "${env.NVIDIA_API_KEY:=}",
append_api_version: bool = "${env.NVIDIA_APPEND_API_VERSION:=True}",
**kwargs, **kwargs,
) -> dict[str, Any]: ) -> dict[str, Any]:
return { return {
"url": url, "base_url": base_url,
"api_key": api_key, "api_key": api_key,
"append_api_version": append_api_version,
} }


@ -44,7 +44,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin):
} }
async def initialize(self) -> None: async def initialize(self) -> None:
logger.info(f"Initializing NVIDIAInferenceAdapter({self.config.url})...") logger.info(f"Initializing NVIDIAInferenceAdapter({self.config.base_url})...")
if _is_nvidia_hosted(self.config): if _is_nvidia_hosted(self.config):
if not self.config.auth_credential: if not self.config.auth_credential:
@ -72,7 +72,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin):
:return: The NVIDIA API base URL :return: The NVIDIA API base URL
""" """
return f"{self.config.url}/v1" if self.config.append_api_version else self.config.url return str(self.config.base_url)
async def list_provider_model_ids(self) -> Iterable[str]: async def list_provider_model_ids(self) -> Iterable[str]:
""" """


@ -8,4 +8,4 @@ from . import NVIDIAConfig
def _is_nvidia_hosted(config: NVIDIAConfig) -> bool: def _is_nvidia_hosted(config: NVIDIAConfig) -> bool:
return "integrate.api.nvidia.com" in config.url return "integrate.api.nvidia.com" in str(config.base_url)


@ -6,20 +6,22 @@
from typing import Any from typing import Any
from pydantic import Field, SecretStr from pydantic import Field, HttpUrl, SecretStr
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
DEFAULT_OLLAMA_URL = "http://localhost:11434" DEFAULT_OLLAMA_URL = "http://localhost:11434/v1"
class OllamaImplConfig(RemoteInferenceProviderConfig): class OllamaImplConfig(RemoteInferenceProviderConfig):
auth_credential: SecretStr | None = Field(default=None, exclude=True) auth_credential: SecretStr | None = Field(default=None, exclude=True)
url: str = DEFAULT_OLLAMA_URL base_url: HttpUrl | None = Field(default=HttpUrl(DEFAULT_OLLAMA_URL))
@classmethod @classmethod
def sample_run_config(cls, url: str = "${env.OLLAMA_URL:=http://localhost:11434}", **kwargs) -> dict[str, Any]: def sample_run_config(
cls, base_url: str = "${env.OLLAMA_URL:=http://localhost:11434/v1}", **kwargs
) -> dict[str, Any]:
return { return {
"url": url, "base_url": base_url,
} }


@ -55,17 +55,23 @@ class OllamaInferenceAdapter(OpenAIMixin):
# ollama client attaches itself to the current event loop (sadly?) # ollama client attaches itself to the current event loop (sadly?)
loop = asyncio.get_running_loop() loop = asyncio.get_running_loop()
if loop not in self._clients: if loop not in self._clients:
self._clients[loop] = AsyncOllamaClient(host=self.config.url) # Ollama client expects base URL without /v1 suffix
base_url_str = str(self.config.base_url)
if base_url_str.endswith("/v1"):
host = base_url_str[:-3]
else:
host = base_url_str
self._clients[loop] = AsyncOllamaClient(host=host)
return self._clients[loop] return self._clients[loop]
def get_api_key(self): def get_api_key(self):
return "NO KEY REQUIRED" return "NO KEY REQUIRED"
def get_base_url(self): def get_base_url(self):
return self.config.url.rstrip("/") + "/v1" return str(self.config.base_url)
async def initialize(self) -> None: async def initialize(self) -> None:
logger.info(f"checking connectivity to Ollama at `{self.config.url}`...") logger.info(f"checking connectivity to Ollama at `{self.config.base_url}`...")
r = await self.health() r = await self.health()
if r["status"] == HealthStatus.ERROR: if r["status"] == HealthStatus.ERROR:
logger.warning( logger.warning(
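Ollama now keeps two views of the same setting: the OpenAI-compatible client uses `base_url` verbatim (with `/v1`), while the native `AsyncOllamaClient` wants the bare host. A small sketch of that split, assuming the default URL:

```python
def ollama_urls(base_url: str = "http://localhost:11434/v1") -> tuple[str, str]:
    openai_base = base_url  # passed to the OpenAI-compatible client as-is
    host = base_url[:-3] if base_url.endswith("/v1") else base_url  # native client host
    return openai_base, host


print(ollama_urls())  # ('http://localhost:11434/v1', 'http://localhost:11434')
```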


@ -6,7 +6,7 @@
from typing import Any from typing import Any
from pydantic import BaseModel, Field from pydantic import BaseModel, Field, HttpUrl
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack_api import json_schema_type from llama_stack_api import json_schema_type
@ -21,8 +21,8 @@ class OpenAIProviderDataValidator(BaseModel):
@json_schema_type @json_schema_type
class OpenAIConfig(RemoteInferenceProviderConfig): class OpenAIConfig(RemoteInferenceProviderConfig):
base_url: str = Field( base_url: HttpUrl | None = Field(
default="https://api.openai.com/v1", default=HttpUrl("https://api.openai.com/v1"),
description="Base URL for OpenAI API", description="Base URL for OpenAI API",
) )


@ -35,4 +35,4 @@ class OpenAIInferenceAdapter(OpenAIMixin):
Returns the OpenAI API base URL from the configuration. Returns the OpenAI API base URL from the configuration.
""" """
return self.config.base_url return str(self.config.base_url)


@ -6,7 +6,7 @@
from typing import Any from typing import Any
from pydantic import Field from pydantic import Field, HttpUrl
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack_api import json_schema_type from llama_stack_api import json_schema_type
@ -14,16 +14,16 @@ from llama_stack_api import json_schema_type
@json_schema_type @json_schema_type
class PassthroughImplConfig(RemoteInferenceProviderConfig): class PassthroughImplConfig(RemoteInferenceProviderConfig):
url: str = Field( base_url: HttpUrl | None = Field(
default=None, default=None,
description="The URL for the passthrough endpoint", description="The URL for the passthrough endpoint",
) )
@classmethod @classmethod
def sample_run_config( def sample_run_config(
cls, url: str = "${env.PASSTHROUGH_URL}", api_key: str = "${env.PASSTHROUGH_API_KEY}", **kwargs cls, base_url: HttpUrl | None = "${env.PASSTHROUGH_URL}", api_key: str = "${env.PASSTHROUGH_API_KEY}", **kwargs
) -> dict[str, Any]: ) -> dict[str, Any]:
return { return {
"url": url, "base_url": base_url,
"api_key": api_key, "api_key": api_key,
} }


@ -82,8 +82,8 @@ class PassthroughInferenceAdapter(NeedsRequestProviderData, Inference):
def _get_passthrough_url(self) -> str: def _get_passthrough_url(self) -> str:
"""Get the passthrough URL from config or provider data.""" """Get the passthrough URL from config or provider data."""
if self.config.url is not None: if self.config.base_url is not None:
return self.config.url return str(self.config.base_url)
provider_data = self.get_request_provider_data() provider_data = self.get_request_provider_data()
if provider_data is None: if provider_data is None:


@ -6,7 +6,7 @@
from typing import Any from typing import Any
from pydantic import BaseModel, Field, SecretStr from pydantic import BaseModel, Field, HttpUrl, SecretStr
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack_api import json_schema_type from llama_stack_api import json_schema_type
@ -21,7 +21,7 @@ class RunpodProviderDataValidator(BaseModel):
@json_schema_type @json_schema_type
class RunpodImplConfig(RemoteInferenceProviderConfig): class RunpodImplConfig(RemoteInferenceProviderConfig):
url: str | None = Field( base_url: HttpUrl | None = Field(
default=None, default=None,
description="The URL for the Runpod model serving endpoint", description="The URL for the Runpod model serving endpoint",
) )
@ -34,6 +34,6 @@ class RunpodImplConfig(RemoteInferenceProviderConfig):
@classmethod @classmethod
def sample_run_config(cls, **kwargs: Any) -> dict[str, Any]: def sample_run_config(cls, **kwargs: Any) -> dict[str, Any]:
return { return {
"url": "${env.RUNPOD_URL:=}", "base_url": "${env.RUNPOD_URL:=}",
"api_token": "${env.RUNPOD_API_TOKEN}", "api_token": "${env.RUNPOD_API_TOKEN}",
} }


@ -28,7 +28,7 @@ class RunpodInferenceAdapter(OpenAIMixin):
def get_base_url(self) -> str: def get_base_url(self) -> str:
"""Get base URL for OpenAI client.""" """Get base URL for OpenAI client."""
return self.config.url return str(self.config.base_url)
async def openai_chat_completion( async def openai_chat_completion(
self, self,


@ -6,7 +6,7 @@
from typing import Any from typing import Any
from pydantic import BaseModel, Field from pydantic import BaseModel, Field, HttpUrl
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack_api import json_schema_type from llama_stack_api import json_schema_type
@ -21,14 +21,14 @@ class SambaNovaProviderDataValidator(BaseModel):
@json_schema_type @json_schema_type
class SambaNovaImplConfig(RemoteInferenceProviderConfig): class SambaNovaImplConfig(RemoteInferenceProviderConfig):
url: str = Field( base_url: HttpUrl | None = Field(
default="https://api.sambanova.ai/v1", default=HttpUrl("https://api.sambanova.ai/v1"),
description="The URL for the SambaNova AI server", description="The URL for the SambaNova AI server",
) )
@classmethod @classmethod
def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY:=}", **kwargs) -> dict[str, Any]: def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY:=}", **kwargs) -> dict[str, Any]:
return { return {
"url": "https://api.sambanova.ai/v1", "base_url": "https://api.sambanova.ai/v1",
"api_key": api_key, "api_key": api_key,
} }


@ -25,4 +25,4 @@ class SambaNovaInferenceAdapter(OpenAIMixin):
:return: The SambaNova base URL :return: The SambaNova base URL
""" """
return self.config.url return str(self.config.base_url)


@ -5,7 +5,7 @@
# the root directory of this source tree. # the root directory of this source tree.
from pydantic import BaseModel, Field, SecretStr from pydantic import BaseModel, Field, HttpUrl, SecretStr
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack_api import json_schema_type from llama_stack_api import json_schema_type
@ -15,18 +15,19 @@ from llama_stack_api import json_schema_type
class TGIImplConfig(RemoteInferenceProviderConfig): class TGIImplConfig(RemoteInferenceProviderConfig):
auth_credential: SecretStr | None = Field(default=None, exclude=True) auth_credential: SecretStr | None = Field(default=None, exclude=True)
url: str = Field( base_url: HttpUrl | None = Field(
description="The URL for the TGI serving endpoint", default=None,
description="The URL for the TGI serving endpoint (should include /v1 path)",
) )
@classmethod @classmethod
def sample_run_config( def sample_run_config(
cls, cls,
url: str = "${env.TGI_URL:=}", base_url: str = "${env.TGI_URL:=}",
**kwargs, **kwargs,
): ):
return { return {
"url": url, "base_url": base_url,
} }


@ -8,7 +8,7 @@
from collections.abc import Iterable from collections.abc import Iterable
from huggingface_hub import AsyncInferenceClient, HfApi from huggingface_hub import AsyncInferenceClient, HfApi
from pydantic import SecretStr from pydantic import HttpUrl, SecretStr
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
@ -23,7 +23,7 @@ log = get_logger(name=__name__, category="inference::tgi")
class _HfAdapter(OpenAIMixin): class _HfAdapter(OpenAIMixin):
url: str base_url: HttpUrl
api_key: SecretStr api_key: SecretStr
hf_client: AsyncInferenceClient hf_client: AsyncInferenceClient
@ -36,7 +36,7 @@ class _HfAdapter(OpenAIMixin):
return "NO KEY REQUIRED" return "NO KEY REQUIRED"
def get_base_url(self): def get_base_url(self):
return self.url return self.base_url
async def list_provider_model_ids(self) -> Iterable[str]: async def list_provider_model_ids(self) -> Iterable[str]:
return [self.model_id] return [self.model_id]
@ -50,14 +50,20 @@ class _HfAdapter(OpenAIMixin):
class TGIAdapter(_HfAdapter): class TGIAdapter(_HfAdapter):
async def initialize(self, config: TGIImplConfig) -> None: async def initialize(self, config: TGIImplConfig) -> None:
if not config.url: if not config.base_url:
raise ValueError("You must provide a URL in run.yaml (or via the TGI_URL environment variable) to use TGI.") raise ValueError("You must provide a URL in run.yaml (or via the TGI_URL environment variable) to use TGI.")
log.info(f"Initializing TGI client with url={config.url}") log.info(f"Initializing TGI client with url={config.base_url}")
self.hf_client = AsyncInferenceClient(model=config.url, provider="hf-inference") # Extract base URL without /v1 for HF client initialization
base_url_str = str(config.base_url).rstrip("/")
if base_url_str.endswith("/v1"):
base_url_for_client = base_url_str[:-3]
else:
base_url_for_client = base_url_str
self.hf_client = AsyncInferenceClient(model=base_url_for_client, provider="hf-inference")
endpoint_info = await self.hf_client.get_endpoint_info() endpoint_info = await self.hf_client.get_endpoint_info()
self.max_tokens = endpoint_info["max_total_tokens"] self.max_tokens = endpoint_info["max_total_tokens"]
self.model_id = endpoint_info["model_id"] self.model_id = endpoint_info["model_id"]
self.url = f"{config.url.rstrip('/')}/v1" self.base_url = config.base_url
self.api_key = SecretStr("NO_KEY") self.api_key = SecretStr("NO_KEY")


@ -6,7 +6,7 @@
from typing import Any from typing import Any
from pydantic import Field from pydantic import Field, HttpUrl
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack_api import json_schema_type from llama_stack_api import json_schema_type
@ -14,14 +14,14 @@ from llama_stack_api import json_schema_type
@json_schema_type @json_schema_type
class TogetherImplConfig(RemoteInferenceProviderConfig): class TogetherImplConfig(RemoteInferenceProviderConfig):
url: str = Field( base_url: HttpUrl | None = Field(
default="https://api.together.xyz/v1", default=HttpUrl("https://api.together.xyz/v1"),
description="The URL for the Together AI server", description="The URL for the Together AI server",
) )
@classmethod @classmethod
def sample_run_config(cls, **kwargs) -> dict[str, Any]: def sample_run_config(cls, **kwargs) -> dict[str, Any]:
return { return {
"url": "https://api.together.xyz/v1", "base_url": "https://api.together.xyz/v1",
"api_key": "${env.TOGETHER_API_KEY:=}", "api_key": "${env.TOGETHER_API_KEY:=}",
} }


@ -9,7 +9,6 @@ from collections.abc import Iterable
from typing import Any, cast from typing import Any, cast
from together import AsyncTogether # type: ignore[import-untyped] from together import AsyncTogether # type: ignore[import-untyped]
from together.constants import BASE_URL # type: ignore[import-untyped]
from llama_stack.core.request_headers import NeedsRequestProviderData from llama_stack.core.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger from llama_stack.log import get_logger
@ -42,7 +41,7 @@ class TogetherInferenceAdapter(OpenAIMixin, NeedsRequestProviderData):
provider_data_api_key_field: str = "together_api_key" provider_data_api_key_field: str = "together_api_key"
def get_base_url(self): def get_base_url(self):
return BASE_URL return str(self.config.base_url)
def _get_client(self) -> AsyncTogether: def _get_client(self) -> AsyncTogether:
together_api_key = None together_api_key = None


@ -6,7 +6,7 @@
from pathlib import Path from pathlib import Path
from pydantic import Field, SecretStr, field_validator from pydantic import Field, HttpUrl, SecretStr, field_validator
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack_api import json_schema_type from llama_stack_api import json_schema_type
@ -14,7 +14,7 @@ from llama_stack_api import json_schema_type
@json_schema_type @json_schema_type
class VLLMInferenceAdapterConfig(RemoteInferenceProviderConfig): class VLLMInferenceAdapterConfig(RemoteInferenceProviderConfig):
url: str | None = Field( base_url: HttpUrl | None = Field(
default=None, default=None,
description="The URL for the vLLM model serving endpoint", description="The URL for the vLLM model serving endpoint",
) )
@ -48,11 +48,11 @@ class VLLMInferenceAdapterConfig(RemoteInferenceProviderConfig):
@classmethod @classmethod
def sample_run_config( def sample_run_config(
cls, cls,
url: str = "${env.VLLM_URL:=}", base_url: str = "${env.VLLM_URL:=}",
**kwargs, **kwargs,
): ):
return { return {
"url": url, "base_url": base_url,
"max_tokens": "${env.VLLM_MAX_TOKENS:=4096}", "max_tokens": "${env.VLLM_MAX_TOKENS:=4096}",
"api_token": "${env.VLLM_API_TOKEN:=fake}", "api_token": "${env.VLLM_API_TOKEN:=fake}",
"tls_verify": "${env.VLLM_TLS_VERIFY:=true}", "tls_verify": "${env.VLLM_TLS_VERIFY:=true}",


@ -39,12 +39,12 @@ class VLLMInferenceAdapter(OpenAIMixin):
def get_base_url(self) -> str: def get_base_url(self) -> str:
"""Get the base URL from config.""" """Get the base URL from config."""
if not self.config.url: if not self.config.base_url:
raise ValueError("No base URL configured") raise ValueError("No base URL configured")
return self.config.url return str(self.config.base_url)
async def initialize(self) -> None: async def initialize(self) -> None:
if not self.config.url: if not self.config.base_url:
raise ValueError( raise ValueError(
"You must provide a URL in run.yaml (or via the VLLM_URL environment variable) to use vLLM." "You must provide a URL in run.yaml (or via the VLLM_URL environment variable) to use vLLM."
) )


@ -7,7 +7,7 @@
import os import os
from typing import Any from typing import Any
from pydantic import BaseModel, Field from pydantic import BaseModel, Field, HttpUrl
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack_api import json_schema_type from llama_stack_api import json_schema_type
@ -23,7 +23,7 @@ class WatsonXProviderDataValidator(BaseModel):
@json_schema_type @json_schema_type
class WatsonXConfig(RemoteInferenceProviderConfig): class WatsonXConfig(RemoteInferenceProviderConfig):
url: str = Field( base_url: HttpUrl | None = Field(
default_factory=lambda: os.getenv("WATSONX_BASE_URL", "https://us-south.ml.cloud.ibm.com"), default_factory=lambda: os.getenv("WATSONX_BASE_URL", "https://us-south.ml.cloud.ibm.com"),
description="A base url for accessing the watsonx.ai", description="A base url for accessing the watsonx.ai",
) )
@ -39,7 +39,7 @@ class WatsonXConfig(RemoteInferenceProviderConfig):
@classmethod @classmethod
def sample_run_config(cls, **kwargs) -> dict[str, Any]: def sample_run_config(cls, **kwargs) -> dict[str, Any]:
return { return {
"url": "${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}", "base_url": "${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}",
"api_key": "${env.WATSONX_API_KEY:=}", "api_key": "${env.WATSONX_API_KEY:=}",
"project_id": "${env.WATSONX_PROJECT_ID:=}", "project_id": "${env.WATSONX_PROJECT_ID:=}",
} }


@ -255,7 +255,7 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
) )
def get_base_url(self) -> str: def get_base_url(self) -> str:
return self.config.url return str(self.config.base_url)
# Copied from OpenAIMixin # Copied from OpenAIMixin
async def check_model_availability(self, model: str) -> bool: async def check_model_availability(self, model: str) -> bool:
@ -316,7 +316,7 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
""" """
Retrieves foundation model specifications from the watsonx.ai API. Retrieves foundation model specifications from the watsonx.ai API.
""" """
url = f"{self.config.url}/ml/v1/foundation_model_specs?version=2023-10-25" url = f"{str(self.config.base_url)}/ml/v1/foundation_model_specs?version=2023-10-25"
headers = { headers = {
# Note that there is no authorization header. Listing models does not require authentication. # Note that there is no authorization header. Listing models does not require authentication.
"Content-Type": "application/json", "Content-Type": "application/json",


@ -3,23 +3,10 @@
# #
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from collections.abc import Iterable
from typing import ( from typing import (
Any, Any,
) )
from openai.types.chat import (
ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam,
)
try:
from openai.types.chat import (
ChatCompletionMessageFunctionToolCall as OpenAIChatCompletionMessageFunctionToolCall,
)
except ImportError:
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall as OpenAIChatCompletionMessageFunctionToolCall,
)
from openai.types.chat import ( from openai.types.chat import (
ChatCompletionMessageToolCall, ChatCompletionMessageToolCall,
) )
@ -32,18 +19,6 @@ from llama_stack.models.llama.datatypes import (
ToolCall, ToolCall,
ToolDefinition, ToolDefinition,
) )
from llama_stack_api import (
URL,
GreedySamplingStrategy,
ImageContentItem,
JsonSchemaResponseFormat,
OpenAIResponseFormatParam,
SamplingParams,
TextContentItem,
TopKSamplingStrategy,
TopPSamplingStrategy,
_URLOrData,
)
logger = get_logger(name=__name__, category="providers::utils") logger = get_logger(name=__name__, category="providers::utils")
@ -73,42 +48,6 @@ class OpenAICompatCompletionResponse(BaseModel):
choices: list[OpenAICompatCompletionChoice] choices: list[OpenAICompatCompletionChoice]
def get_sampling_strategy_options(params: SamplingParams) -> dict:
options = {}
if isinstance(params.strategy, GreedySamplingStrategy):
options["temperature"] = 0.0
elif isinstance(params.strategy, TopPSamplingStrategy):
if params.strategy.temperature is not None:
options["temperature"] = params.strategy.temperature
if params.strategy.top_p is not None:
options["top_p"] = params.strategy.top_p
elif isinstance(params.strategy, TopKSamplingStrategy):
options["top_k"] = params.strategy.top_k
else:
raise ValueError(f"Unsupported sampling strategy: {params.strategy}")
return options
def get_sampling_options(params: SamplingParams | None) -> dict:
if not params:
return {}
options = {}
if params:
options.update(get_sampling_strategy_options(params))
if params.max_tokens:
options["max_tokens"] = params.max_tokens
if params.repetition_penalty is not None and params.repetition_penalty != 1.0:
options["repeat_penalty"] = params.repetition_penalty
if params.stop is not None:
options["stop"] = params.stop
return options
def text_from_choice(choice) -> str: def text_from_choice(choice) -> str:
if hasattr(choice, "delta") and choice.delta: if hasattr(choice, "delta") and choice.delta:
return choice.delta.content # type: ignore[no-any-return] # external OpenAI types lack precise annotations return choice.delta.content # type: ignore[no-any-return] # external OpenAI types lack precise annotations
@ -253,154 +192,6 @@ def convert_tooldef_to_openai_tool(tool: ToolDefinition) -> dict:
return out return out
def _convert_stop_reason_to_openai_finish_reason(stop_reason: StopReason) -> str:
"""
Convert a StopReason to an OpenAI chat completion finish_reason.
"""
return {
StopReason.end_of_turn: "stop",
StopReason.end_of_message: "tool_calls",
StopReason.out_of_tokens: "length",
}.get(stop_reason, "stop")
def _convert_openai_finish_reason(finish_reason: str) -> StopReason:
"""
Convert an OpenAI chat completion finish_reason to a StopReason.
finish_reason: Literal["stop", "length", "tool_calls", ...]
- stop: model hit a natural stop point or a provided stop sequence
- length: maximum number of tokens specified in the request was reached
- tool_calls: model called a tool
->
class StopReason(Enum):
end_of_turn = "end_of_turn"
end_of_message = "end_of_message"
out_of_tokens = "out_of_tokens"
"""
# TODO(mf): are end_of_turn and end_of_message semantics correct?
return {
"stop": StopReason.end_of_turn,
"length": StopReason.out_of_tokens,
"tool_calls": StopReason.end_of_message,
}.get(finish_reason, StopReason.end_of_turn)
def _convert_openai_request_tools(tools: list[dict[str, Any]] | None = None) -> list[ToolDefinition]:
lls_tools: list[ToolDefinition] = []
if not tools:
return lls_tools
for tool in tools:
tool_fn = tool.get("function", {})
tool_name = tool_fn.get("name", None)
tool_desc = tool_fn.get("description", None)
tool_params = tool_fn.get("parameters", None)
lls_tool = ToolDefinition(
tool_name=tool_name,
description=tool_desc,
input_schema=tool_params, # Pass through entire JSON Schema
)
lls_tools.append(lls_tool)
return lls_tools
def _convert_openai_request_response_format(
response_format: OpenAIResponseFormatParam | None = None,
):
if not response_format:
return None
# response_format can be a dict or a pydantic model
response_format_dict = dict(response_format) # type: ignore[arg-type] # OpenAIResponseFormatParam union needs dict conversion
if response_format_dict.get("type", "") == "json_schema":
return JsonSchemaResponseFormat(
type="json_schema", # type: ignore[arg-type] # Literal["json_schema"] incompatible with expected type
json_schema=response_format_dict.get("json_schema", {}).get("schema", ""),
)
return None
def _convert_openai_tool_calls(
tool_calls: list[OpenAIChatCompletionMessageFunctionToolCall],
) -> list[ToolCall]:
"""
Convert an OpenAI ChatCompletionMessageToolCall list into a list of ToolCall.
OpenAI ChatCompletionMessageToolCall:
id: str
function: Function
type: Literal["function"]
OpenAI Function:
arguments: str
name: str
->
ToolCall:
call_id: str
tool_name: str
arguments: Dict[str, ...]
"""
if not tool_calls:
return [] # CompletionMessage tool_calls is not optional
return [
ToolCall(
call_id=call.id,
tool_name=call.function.name,
arguments=call.function.arguments,
)
for call in tool_calls
]
def _convert_openai_sampling_params(
max_tokens: int | None = None,
temperature: float | None = None,
top_p: float | None = None,
) -> SamplingParams:
sampling_params = SamplingParams()
if max_tokens:
sampling_params.max_tokens = max_tokens
# Map an explicit temperature of 0 to greedy sampling
if temperature == 0:
sampling_params.strategy = GreedySamplingStrategy()
else:
# OpenAI defaults to 1.0 for temperature and top_p if unset
if temperature is None:
temperature = 1.0
if top_p is None:
top_p = 1.0
sampling_params.strategy = TopPSamplingStrategy(temperature=temperature, top_p=top_p) # type: ignore[assignment] # SamplingParams.strategy union accepts this type
return sampling_params
def openai_content_to_content(content: str | Iterable[OpenAIChatCompletionContentPartParam] | None):
if content is None:
return ""
if isinstance(content, str):
return content
elif isinstance(content, list):
return [openai_content_to_content(c) for c in content]
elif hasattr(content, "type"):
if content.type == "text":
return TextContentItem(type="text", text=content.text) # type: ignore[attr-defined] # Iterable narrowed by hasattr check but mypy doesn't track
elif content.type == "image_url":
return ImageContentItem(type="image", image=_URLOrData(url=URL(uri=content.image_url.url))) # type: ignore[attr-defined] # Iterable narrowed by hasattr check but mypy doesn't track
else:
raise ValueError(f"Unknown content type: {content.type}")
else:
raise ValueError(f"Unknown content type: {content}")
async def prepare_openai_completion_params(**params): async def prepare_openai_completion_params(**params):
async def _prepare_value(value: Any) -> Any: async def _prepare_value(value: Any) -> Any:
new_value = value new_value = value


@ -213,6 +213,19 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel):
return api_key return api_key
def _validate_model_allowed(self, provider_model_id: str) -> None:
"""
Validate that the model is in the allowed_models list if configured.
:param provider_model_id: The provider-specific model ID to validate
:raises ValueError: If the model is not in the allowed_models list
"""
if self.config.allowed_models is not None and provider_model_id not in self.config.allowed_models:
raise ValueError(
f"Model '{provider_model_id}' is not in the allowed models list. "
f"Allowed models: {self.config.allowed_models}"
)
async def _get_provider_model_id(self, model: str) -> str: async def _get_provider_model_id(self, model: str) -> str:
""" """
Get the provider-specific model ID from the model store. Get the provider-specific model ID from the model store.
@ -259,8 +272,11 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel):
Direct OpenAI completion API call. Direct OpenAI completion API call.
""" """
# TODO: fix openai_completion to return type compatible with OpenAI's API response # TODO: fix openai_completion to return type compatible with OpenAI's API response
provider_model_id = await self._get_provider_model_id(params.model)
self._validate_model_allowed(provider_model_id)
completion_kwargs = await prepare_openai_completion_params( completion_kwargs = await prepare_openai_completion_params(
model=await self._get_provider_model_id(params.model), model=provider_model_id,
prompt=params.prompt, prompt=params.prompt,
best_of=params.best_of, best_of=params.best_of,
echo=params.echo, echo=params.echo,
@ -292,6 +308,9 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel):
""" """
Direct OpenAI chat completion API call. Direct OpenAI chat completion API call.
""" """
provider_model_id = await self._get_provider_model_id(params.model)
self._validate_model_allowed(provider_model_id)
messages = params.messages messages = params.messages
if self.download_images: if self.download_images:
@ -313,7 +332,7 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel):
messages = [await _localize_image_url(m) for m in messages] messages = [await _localize_image_url(m) for m in messages]
request_params = await prepare_openai_completion_params( request_params = await prepare_openai_completion_params(
model=await self._get_provider_model_id(params.model), model=provider_model_id,
messages=messages, messages=messages,
frequency_penalty=params.frequency_penalty, frequency_penalty=params.frequency_penalty,
function_call=params.function_call, function_call=params.function_call,
@ -351,10 +370,13 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel):
""" """
Direct OpenAI embeddings API call. Direct OpenAI embeddings API call.
""" """
provider_model_id = await self._get_provider_model_id(params.model)
self._validate_model_allowed(provider_model_id)
# Build request params conditionally to avoid NotGiven/Omit type mismatch # Build request params conditionally to avoid NotGiven/Omit type mismatch
# The OpenAI SDK uses Omit in signatures but NOT_GIVEN has type NotGiven # The OpenAI SDK uses Omit in signatures but NOT_GIVEN has type NotGiven
request_params: dict[str, Any] = { request_params: dict[str, Any] = {
"model": await self._get_provider_model_id(params.model), "model": provider_model_id,
"input": params.input, "input": params.input,
} }
if params.encoding_format is not None: if params.encoding_format is not None:
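A standalone sketch of the `allowed_models` gate added above, with the mixin and its config flattened into a plain function for illustration:

```python
def validate_model_allowed(provider_model_id: str, allowed_models: list[str] | None) -> None:
    if allowed_models is not None and provider_model_id not in allowed_models:
        raise ValueError(
            f"Model '{provider_model_id}' is not in the allowed models list. "
            f"Allowed models: {allowed_models}"
        )


validate_model_allowed("llama3.2:3b-instruct-fp16", None)  # no allow-list configured: passes
validate_model_allowed("llama3.2:3b-instruct-fp16", ["llama3.2:3b-instruct-fp16"])  # listed: passes
# validate_model_allowed("gpt-4o", ["llama3.2:3b-instruct-fp16"])  # would raise ValueError
```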


@ -11,7 +11,7 @@
from typing import Annotated, Any, Literal, Protocol, runtime_checkable from typing import Annotated, Any, Literal, Protocol, runtime_checkable
from fastapi import Body, Query from fastapi import Body, Query
from pydantic import BaseModel, Field from pydantic import BaseModel, Field, field_validator
from llama_stack_api.common.tracing import telemetry_traceable from llama_stack_api.common.tracing import telemetry_traceable
from llama_stack_api.inference import InterleavedContent from llama_stack_api.inference import InterleavedContent
@ -372,6 +372,65 @@ VectorStoreFileStatus = Literal["completed"] | Literal["in_progress"] | Literal[
register_schema(VectorStoreFileStatus, name="VectorStoreFileStatus") register_schema(VectorStoreFileStatus, name="VectorStoreFileStatus")
# VectorStoreFileAttributes type with OpenAPI constraints
VectorStoreFileAttributes = Annotated[
dict[str, Annotated[str, Field(max_length=512)] | float | bool],
Field(
max_length=16,
json_schema_extra={
"propertyNames": {"type": "string", "maxLength": 64},
"x-oaiTypeLabel": "map",
},
description=(
"Set of 16 key-value pairs that can be attached to an object. This can be "
"useful for storing additional information about the object in a structured "
"format, and querying for objects via API or the dashboard. Keys are strings "
"with a maximum length of 64 characters. Values are strings with a maximum "
"length of 512 characters, booleans, or numbers."
),
),
]
def _sanitize_vector_store_attributes(metadata: dict[str, Any] | None) -> dict[str, str | float | bool]:
"""
Sanitize metadata to VectorStoreFileAttributes spec (max 16 properties, primitives only).
Converts dict[str, Any] to dict[str, str | float | bool]:
- Preserves: str (truncated to 512 chars), bool, int/float (as float)
- Converts: list -> comma-separated string
- Filters: dict, None, other types
- Enforces: max 16 properties, max 64 char keys, max 512 char string values
"""
if not metadata:
return {}
sanitized: dict[str, str | float | bool] = {}
for key, value in metadata.items():
# Enforce max 16 properties
if len(sanitized) >= 16:
break
# Enforce max 64 char keys
if len(key) > 64:
continue
# Convert to supported primitive types
if isinstance(value, bool):
sanitized[key] = value
elif isinstance(value, int | float):
sanitized[key] = float(value)
elif isinstance(value, str):
# Enforce max 512 char string values
sanitized[key] = value[:512] if len(value) > 512 else value
elif isinstance(value, list):
# Convert lists to comma-separated strings (max 512 chars)
list_str = ", ".join(str(item) for item in value)
sanitized[key] = list_str[:512] if len(list_str) > 512 else list_str
return sanitized
@json_schema_type @json_schema_type
class VectorStoreFileObject(BaseModel): class VectorStoreFileObject(BaseModel):
"""OpenAI Vector Store File object. """OpenAI Vector Store File object.
@ -389,7 +448,7 @@ class VectorStoreFileObject(BaseModel):
id: str id: str
object: str = "vector_store.file" object: str = "vector_store.file"
attributes: dict[str, Any] = Field(default_factory=dict) attributes: VectorStoreFileAttributes = Field(default_factory=dict)
chunking_strategy: VectorStoreChunkingStrategy chunking_strategy: VectorStoreChunkingStrategy
created_at: int created_at: int
last_error: VectorStoreFileLastError | None = None last_error: VectorStoreFileLastError | None = None
@ -397,6 +456,12 @@ class VectorStoreFileObject(BaseModel):
usage_bytes: int = 0 usage_bytes: int = 0
vector_store_id: str vector_store_id: str
@field_validator("attributes", mode="before")
@classmethod
def _validate_attributes(cls, v: dict[str, Any] | None) -> dict[str, str | float | bool]:
"""Sanitize attributes to match VectorStoreFileAttributes OpenAPI spec."""
return _sanitize_vector_store_attributes(v)
@json_schema_type @json_schema_type
class VectorStoreListFilesResponse(BaseModel): class VectorStoreListFilesResponse(BaseModel):
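To make the sanitization rules concrete, here is a hypothetical metadata dict and the shape it takes after validation; the values follow the docstring above, and this is a sketch rather than an executable doctest against the real import path:

```python
metadata = {
    "title": "Quarterly report",   # str kept (truncated at 512 chars if longer)
    "page_count": 12,              # int -> 12.0
    "reviewed": True,              # bool preserved
    "tags": ["finance", "q3"],     # list -> "finance, q3"
    "owner": {"team": "fin"},      # dict -> dropped
    "notes": None,                 # None -> dropped
}

expected_attributes = {
    "title": "Quarterly report",
    "page_count": 12.0,
    "reviewed": True,
    "tags": "finance, q3",
}
```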


@ -211,3 +211,23 @@ def test_asymmetric_embeddings(llama_stack_client, embedding_model_id):
assert query_response.embeddings is not None assert query_response.embeddings is not None
``` ```
## TypeScript Client Replays
TypeScript SDK tests can run alongside Python tests when testing against `server:<config>` stacks. To enable them, set `TS_CLIENT_PATH` to either a local checkout path or a published npm version of `llama-stack-client-typescript`:
```bash
# Use published npm package (responses suite)
TS_CLIENT_PATH=^0.3.2 scripts/integration-tests.sh --stack-config server:ci-tests --suite responses --setup gpt
# Use local checkout from ~/.cache (recommended for development)
git clone https://github.com/llamastack/llama-stack-client-typescript.git ~/.cache/llama-stack-client-typescript
TS_CLIENT_PATH=~/.cache/llama-stack-client-typescript scripts/integration-tests.sh --stack-config server:ci-tests --suite responses --setup gpt
# Run base suite with TypeScript tests
TS_CLIENT_PATH=~/.cache/llama-stack-client-typescript scripts/integration-tests.sh --stack-config server:ci-tests --suite base --setup ollama
```
TypeScript tests run immediately after Python tests pass, using the same replay fixtures. The mapping between Python suites/setups and TypeScript test files is defined in `tests/integration/client-typescript/suites.json`.
If `TS_CLIENT_PATH` is unset, TypeScript tests are skipped entirely.
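The mapping file is a JSON array of `{suite, setup, files}` entries, which are the fields `run-tests.js` reads. A hypothetical example of its contents, shown here as an equivalent Python literal (the real suite/setup names and test file paths may differ):

```python
suites = [
    {"suite": "responses", "setup": "gpt", "files": ["__tests__/responses/basic.test.ts"]},
    {"suite": "base", "setup": "ollama", "files": ["__tests__/inference/chat-completions.test.ts"]},
]
```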


@ -516,169 +516,3 @@ def test_response_with_instructions(openai_client, client_with_models, text_mode
# Verify instructions from previous response was not carried over to the next response # Verify instructions from previous response was not carried over to the next response
assert response_with_instructions2.instructions == instructions2 assert response_with_instructions2.instructions == instructions2
@pytest.mark.skip(reason="Tool calling is not reliable.")
def test_max_tool_calls_with_function_tools(openai_client, client_with_models, text_model_id):
"""Test handling of max_tool_calls with function tools in responses."""
if isinstance(client_with_models, LlamaStackAsLibraryClient):
pytest.skip("OpenAI responses are not supported when testing with library client yet.")
client = openai_client
max_tool_calls = 1
tools = [
{
"type": "function",
"name": "get_weather",
"description": "Get weather information for a specified location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city name (e.g., 'New York', 'London')",
},
},
},
},
{
"type": "function",
"name": "get_time",
"description": "Get current time for a specified location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city name (e.g., 'New York', 'London')",
},
},
},
},
]
# First create a response that triggers function tools
response = client.responses.create(
model=text_model_id,
input="Can you tell me the weather in Paris and the current time?",
tools=tools,
stream=False,
max_tool_calls=max_tool_calls,
)
# Verify we got two function calls and that the max_tool_calls do not affect function tools
assert len(response.output) == 2
assert response.output[0].type == "function_call"
assert response.output[0].name == "get_weather"
assert response.output[0].status == "completed"
assert response.output[1].type == "function_call"
assert response.output[1].name == "get_time"
assert response.output[0].status == "completed"
# Verify we have a valid max_tool_calls field
assert response.max_tool_calls == max_tool_calls
def test_max_tool_calls_invalid(openai_client, client_with_models, text_model_id):
"""Test handling of invalid max_tool_calls in responses."""
if isinstance(client_with_models, LlamaStackAsLibraryClient):
pytest.skip("OpenAI responses are not supported when testing with library client yet.")
client = openai_client
input = "Search for today's top technology news."
invalid_max_tool_calls = 0
tools = [
{"type": "web_search"},
]
# Create a response with an invalid max_tool_calls value i.e. 0
# Handle ValueError from LLS and BadRequestError from OpenAI client
with pytest.raises((ValueError, BadRequestError)) as excinfo:
client.responses.create(
model=text_model_id,
input=input,
tools=tools,
stream=False,
max_tool_calls=invalid_max_tool_calls,
)
error_message = str(excinfo.value)
assert f"Invalid max_tool_calls={invalid_max_tool_calls}; should be >= 1" in error_message, (
f"Expected error message about invalid max_tool_calls, got: {error_message}"
)
def test_max_tool_calls_with_builtin_tools(openai_client, client_with_models, text_model_id):
"""Test handling of max_tool_calls with built-in tools in responses."""
if isinstance(client_with_models, LlamaStackAsLibraryClient):
pytest.skip("OpenAI responses are not supported when testing with library client yet.")
client = openai_client
input = "Search for today's top technology and a positive news story. You MUST make exactly two separate web search calls."
max_tool_calls = [1, 5]
tools = [
{"type": "web_search"},
]
# First create a response that triggers web_search tools without max_tool_calls
response = client.responses.create(
model=text_model_id,
input=input,
tools=tools,
stream=False,
)
# Verify we got two web search calls followed by a message
assert len(response.output) == 3
assert response.output[0].type == "web_search_call"
assert response.output[0].status == "completed"
assert response.output[1].type == "web_search_call"
assert response.output[1].status == "completed"
assert response.output[2].type == "message"
assert response.output[2].status == "completed"
assert response.output[2].role == "assistant"
# Next create a response that triggers web_search tools with max_tool_calls set to 1
response_2 = client.responses.create(
model=text_model_id,
input=input,
tools=tools,
stream=False,
max_tool_calls=max_tool_calls[0],
)
# Verify we got one web search tool call followed by a message
assert len(response_2.output) == 2
assert response_2.output[0].type == "web_search_call"
assert response_2.output[0].status == "completed"
assert response_2.output[1].type == "message"
assert response_2.output[1].status == "completed"
assert response_2.output[1].role == "assistant"
# Verify we have a valid max_tool_calls field
assert response_2.max_tool_calls == max_tool_calls[0]
# Finally create a response that triggers web_search tools with max_tool_calls set to 5
response_3 = client.responses.create(
model=text_model_id,
input=input,
tools=tools,
stream=False,
max_tool_calls=max_tool_calls[1],
)
# Verify we got two web search calls followed by a message
assert len(response_3.output) == 3
assert response_3.output[0].type == "web_search_call"
assert response_3.output[0].status == "completed"
assert response_3.output[1].type == "web_search_call"
assert response_3.output[1].status == "completed"
assert response_3.output[2].type == "message"
assert response_3.output[2].status == "completed"
assert response_3.output[2].role == "assistant"
# Verify we have a valid max_tool_calls field
assert response_3.max_tool_calls == max_tool_calls[1]


@ -0,0 +1,104 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the terms described in the LICENSE file in
// the root directory of this source tree.
/**
* Integration tests for Inference API (Chat Completions).
* Ported from: llama-stack/tests/integration/inference/test_openai_completion.py
*
* IMPORTANT: Test cases must match EXACTLY with Python tests to use recorded API responses.
*/
import { createTestClient, requireTextModel } from '../setup';
describe('Inference API - Chat Completions', () => {
// Test cases matching llama-stack/tests/integration/test_cases/inference/chat_completion.json
const chatCompletionTestCases = [
{
id: 'non_streaming_01',
question: 'Which planet do humans live on?',
expected: 'earth',
testId:
'tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-inference:chat_completion:non_streaming_01]',
},
{
id: 'non_streaming_02',
question: 'Which planet has rings around it with a name starting with letter S?',
expected: 'saturn',
testId:
'tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-inference:chat_completion:non_streaming_02]',
},
];
const streamingTestCases = [
{
id: 'streaming_01',
question: "What's the name of the Sun in latin?",
expected: 'sol',
testId:
'tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-inference:chat_completion:streaming_01]',
},
{
id: 'streaming_02',
question: 'What is the name of the US captial?',
expected: 'washington',
testId:
'tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-inference:chat_completion:streaming_02]',
},
];
test.each(chatCompletionTestCases)(
'chat completion non-streaming: $id',
async ({ question, expected, testId }) => {
const client = createTestClient(testId);
const textModel = requireTextModel();
const response = await client.chat.completions.create({
model: textModel,
messages: [
{
role: 'user',
content: question,
},
],
stream: false,
});
// Non-streaming responses have choices with message property
const choice = response.choices[0];
expect(choice).toBeDefined();
if (!choice || !('message' in choice)) {
throw new Error('Expected non-streaming response with message');
}
const content = choice.message.content;
expect(content).toBeDefined();
const messageContent = typeof content === 'string' ? content.toLowerCase().trim() : '';
expect(messageContent.length).toBeGreaterThan(0);
expect(messageContent).toContain(expected.toLowerCase());
},
);
test.each(streamingTestCases)('chat completion streaming: $id', async ({ question, expected, testId }) => {
const client = createTestClient(testId);
const textModel = requireTextModel();
const stream = await client.chat.completions.create({
model: textModel,
messages: [{ role: 'user', content: question }],
stream: true,
});
const streamedContent: string[] = [];
for await (const chunk of stream) {
if (chunk.choices && chunk.choices.length > 0 && chunk.choices[0]?.delta?.content) {
streamedContent.push(chunk.choices[0].delta.content);
}
}
expect(streamedContent.length).toBeGreaterThan(0);
const fullContent = streamedContent.join('').toLowerCase().trim();
expect(fullContent).toContain(expected.toLowerCase());
});
});


@ -0,0 +1,132 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the terms described in the LICENSE file in
// the root directory of this source tree.
/**
* Integration tests for Responses API.
* Ported from: llama-stack/tests/integration/responses/test_basic_responses.py
*
* IMPORTANT: Test cases and IDs must match EXACTLY with Python tests to use recorded API responses.
*/
import { createTestClient, requireTextModel, getResponseOutputText } from '../setup';
describe('Responses API - Basic', () => {
// Test cases matching llama-stack/tests/integration/responses/fixtures/test_cases.py
const basicTestCases = [
{
id: 'earth',
input: 'Which planet do humans live on?',
expected: 'earth',
// Use client_with_models fixture to match non-streaming recordings
testId:
'tests/integration/responses/test_basic_responses.py::test_response_non_streaming_basic[client_with_models-txt=openai/gpt-4o-earth]',
},
{
id: 'saturn',
input: 'Which planet has rings around it with a name starting with letter S?',
expected: 'saturn',
testId:
'tests/integration/responses/test_basic_responses.py::test_response_non_streaming_basic[client_with_models-txt=openai/gpt-4o-saturn]',
},
];
test.each(basicTestCases)('non-streaming basic response: $id', async ({ input, expected, testId }) => {
// Create client with test_id for all requests
const client = createTestClient(testId);
const textModel = requireTextModel();
// Create a response
const response = await client.responses.create({
model: textModel,
input,
stream: false,
});
// Verify response has content
const outputText = getResponseOutputText(response).toLowerCase().trim();
expect(outputText.length).toBeGreaterThan(0);
expect(outputText).toContain(expected.toLowerCase());
// Verify usage is reported
expect(response.usage).toBeDefined();
expect(response.usage!.input_tokens).toBeGreaterThan(0);
expect(response.usage!.output_tokens).toBeGreaterThan(0);
expect(response.usage!.total_tokens).toBe(response.usage!.input_tokens + response.usage!.output_tokens);
// Verify stored response matches
const retrievedResponse = await client.responses.retrieve(response.id);
expect(getResponseOutputText(retrievedResponse)).toBe(getResponseOutputText(response));
// Test follow-up with previous_response_id
const nextResponse = await client.responses.create({
model: textModel,
input: 'Repeat your previous response in all caps.',
previous_response_id: response.id,
});
const nextOutputText = getResponseOutputText(nextResponse).trim();
expect(nextOutputText).toContain(expected.toUpperCase());
});
test.each(basicTestCases)('streaming basic response: $id', async ({ input, expected, testId }) => {
// Modify test_id for streaming variant
const streamingTestId = testId.replace(
'test_response_non_streaming_basic',
'test_response_streaming_basic',
);
const client = createTestClient(streamingTestId);
const textModel = requireTextModel();
// Create a streaming response
const stream = await client.responses.create({
model: textModel,
input,
stream: true,
});
const events: any[] = [];
let responseId = '';
for await (const chunk of stream) {
events.push(chunk);
if (chunk.type === 'response.created') {
// Verify response.created is the first event
expect(events.length).toBe(1);
expect(chunk.response.status).toBe('in_progress');
responseId = chunk.response.id;
} else if (chunk.type === 'response.completed') {
// Verify response.completed comes after response.created
expect(events.length).toBeGreaterThanOrEqual(2);
expect(chunk.response.status).toBe('completed');
expect(chunk.response.id).toBe(responseId);
// Verify content quality
const outputText = getResponseOutputText(chunk.response).toLowerCase().trim();
expect(outputText.length).toBeGreaterThan(0);
expect(outputText).toContain(expected.toLowerCase());
// Verify usage is reported
expect(chunk.response.usage).toBeDefined();
expect(chunk.response.usage!.input_tokens).toBeGreaterThan(0);
expect(chunk.response.usage!.output_tokens).toBeGreaterThan(0);
expect(chunk.response.usage!.total_tokens).toBe(
chunk.response.usage!.input_tokens + chunk.response.usage!.output_tokens,
);
}
}
// Verify we got both events
expect(events.length).toBeGreaterThanOrEqual(2);
const firstEvent = events[0];
const lastEvent = events[events.length - 1];
expect(firstEvent.type).toBe('response.created');
expect(lastEvent.type).toBe('response.completed');
// Verify stored response matches streamed response
const retrievedResponse = await client.responses.retrieve(responseId);
expect(getResponseOutputText(retrievedResponse)).toBe(getResponseOutputText(lastEvent.response));
});
});


@ -0,0 +1,31 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the terms described in the LICENSE file in
// the root directory of this source tree.
/** @type {import('ts-jest').JestConfigWithTsJest} */
module.exports = {
preset: 'ts-jest/presets/default-esm',
testEnvironment: 'node',
extensionsToTreatAsEsm: ['.ts'],
moduleNameMapper: {
'^(\\.{1,2}/.*)\\.js$': '$1',
},
transform: {
'^.+\\.tsx?$': [
'ts-jest',
{
useESM: true,
tsconfig: {
module: 'ES2022',
moduleResolution: 'bundler',
},
},
],
},
testMatch: ['<rootDir>/__tests__/**/*.test.ts'],
setupFilesAfterEnv: ['<rootDir>/setup.ts'],
testTimeout: 60000, // 60 seconds (integration tests can be slow)
watchman: false, // Disable watchman to avoid permission issues
};

File diff suppressed because it is too large


@ -0,0 +1,18 @@
{
"name": "llama-stack-typescript-integration-tests",
"version": "0.0.1",
"private": true,
"description": "TypeScript client integration tests for Llama Stack",
"scripts": {
"test": "node run-tests.js"
},
"devDependencies": {
"@swc/core": "^1.3.102",
"@swc/jest": "^0.2.29",
"@types/jest": "^29.4.0",
"@types/node": "^20.0.0",
"jest": "^29.4.0",
"ts-jest": "^29.1.0",
"typescript": "^5.0.0"
}
}


@ -0,0 +1,63 @@
#!/usr/bin/env node
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the terms described in the LICENSE file in
// the root directory of this source tree.
/**
* Test runner that finds and executes TypeScript tests based on suite/setup mapping.
* Called by integration-tests.sh via npm test.
*/
const fs = require('fs');
const path = require('path');
const { execSync } = require('child_process');
const suite = process.env.LLAMA_STACK_TEST_SUITE;
const setup = process.env.LLAMA_STACK_TEST_SETUP || '';
if (!suite) {
console.error('Error: LLAMA_STACK_TEST_SUITE environment variable is required');
process.exit(1);
}
// Read suites.json to find matching test files
const suitesPath = path.join(__dirname, 'suites.json');
if (!fs.existsSync(suitesPath)) {
console.log(`No TypeScript tests configured (${suitesPath} not found)`);
process.exit(0);
}
const suites = JSON.parse(fs.readFileSync(suitesPath, 'utf-8'));
// Find matching entry
let testFiles = [];
for (const entry of suites) {
if (entry.suite !== suite) {
continue;
}
const entrySetup = entry.setup || '';
if (entrySetup && entrySetup !== setup) {
continue;
}
testFiles = entry.files || [];
break;
}
if (testFiles.length === 0) {
console.log(`No TypeScript integration tests mapped for suite ${suite} (setup ${setup})`);
process.exit(0);
}
console.log(`Running TypeScript tests for suite ${suite} (setup ${setup}): ${testFiles.join(', ')}`);
// Run Jest with the mapped test files
try {
execSync(`npx jest --config jest.integration.config.js ${testFiles.join(' ')}`, {
stdio: 'inherit',
cwd: __dirname,
});
} catch (error) {
process.exit(error.status || 1);
}
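For reference, the suite/setup matching that this runner applies to suites.json can be restated with explicit types. This is an illustrative sketch only, not part of the change; SuiteEntry simply names the fields (suite, setup, files) the script reads:

// Typed restatement of the matching rule in run-tests.js (illustrative only).
interface SuiteEntry {
  suite: string;     // integration suite name, e.g. "responses"
  setup?: string;    // optional setup name; empty or absent matches any setup
  files?: string[];  // Jest test files to run for this suite/setup pair
}

function matchTestFiles(entries: SuiteEntry[], suite: string, setup: string): string[] {
  for (const entry of entries) {
    if (entry.suite !== suite) continue;
    const entrySetup = entry.setup ?? '';
    if (entrySetup && entrySetup !== setup) continue;
    return entry.files ?? [];
  }
  return [];
}

// Example: matchTestFiles(suites, 'responses', 'gpt') -> ['__tests__/responses.test.ts']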

View file

@@ -0,0 +1,162 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the terms described in the LICENSE file in
// the root directory of this source tree.
/**
* Global setup for integration tests.
* This file mimics pytest's fixture system by providing shared test configuration.
*/
import LlamaStackClient from 'llama-stack-client';
/**
* Load test configuration from the Python setup system.
* This reads setup definitions from tests/integration/suites.py via get_setup_env.py.
*/
function loadTestConfig() {
const baseURL = process.env['TEST_API_BASE_URL'];
const setupName = process.env['LLAMA_STACK_TEST_SETUP'];
const textModel = process.env['LLAMA_STACK_TEST_TEXT_MODEL'];
const embeddingModel = process.env['LLAMA_STACK_TEST_EMBEDDING_MODEL'];
if (!baseURL) {
throw new Error(
'TEST_API_BASE_URL is required for integration tests. ' +
'Run tests using: ./scripts/integration-test.sh',
);
}
return {
baseURL,
textModel,
embeddingModel,
setupName,
};
}
// Read configuration from environment variables (set by scripts/integration-test.sh)
export const TEST_CONFIG = loadTestConfig();
// Validate required configuration
beforeAll(() => {
console.log('\n=== Integration Test Configuration ===');
console.log(`Base URL: ${TEST_CONFIG.baseURL}`);
console.log(`Setup: ${TEST_CONFIG.setupName || 'NOT SET'}`);
console.log(
`Text Model: ${TEST_CONFIG.textModel || 'NOT SET - tests requiring text model will be skipped'}`,
);
console.log(
`Embedding Model: ${
TEST_CONFIG.embeddingModel || 'NOT SET - tests requiring embedding model will be skipped'
}`,
);
console.log('=====================================\n');
});
/**
* Create a client instance for integration tests.
* Mimics pytest's `llama_stack_client` fixture.
*
* @param testId - Test ID to send in X-LlamaStack-Provider-Data header for replay mode.
* Format: "tests/integration/responses/test_basic_responses.py::test_name[params]"
*/
export function createTestClient(testId?: string): LlamaStackClient {
const headers: Record<string, string> = {};
// In server mode with replay, send test ID for recording isolation
if (process.env['LLAMA_STACK_TEST_STACK_CONFIG_TYPE'] === 'server' && testId) {
headers['X-LlamaStack-Provider-Data'] = JSON.stringify({
__test_id: testId,
});
}
return new LlamaStackClient({
baseURL: TEST_CONFIG.baseURL,
timeout: 60000, // 60 seconds
defaultHeaders: headers,
});
}
/**
* Skip test if required model is not configured.
* Mimics pytest's `skip_if_no_model` autouse fixture.
*/
export function skipIfNoModel(modelType: 'text' | 'embedding'): typeof test {
const model = modelType === 'text' ? TEST_CONFIG.textModel : TEST_CONFIG.embeddingModel;
if (!model) {
const envVar = modelType === 'text' ? 'LLAMA_STACK_TEST_TEXT_MODEL' : 'LLAMA_STACK_TEST_EMBEDDING_MODEL';
console.warn(`Skipping: ${modelType} model not configured (set ${envVar})`);
return test.skip.bind(test) as typeof test;
}
return test;
}
/**
* Get the configured text model, throwing if not set.
* Use this in tests that absolutely require a text model.
*/
export function requireTextModel(): string {
if (!TEST_CONFIG.textModel) {
throw new Error(
'LLAMA_STACK_TEST_TEXT_MODEL environment variable is required. ' +
'Run tests using: ./scripts/integration-test.sh',
);
}
return TEST_CONFIG.textModel;
}
/**
* Get the configured embedding model, throwing if not set.
* Use this in tests that absolutely require an embedding model.
*/
export function requireEmbeddingModel(): string {
if (!TEST_CONFIG.embeddingModel) {
throw new Error(
'LLAMA_STACK_TEST_EMBEDDING_MODEL environment variable is required. ' +
'Run tests using: ./scripts/integration-test.sh',
);
}
return TEST_CONFIG.embeddingModel;
}
/**
* Extracts aggregated text output from a ResponseObject.
* This concatenates all text content from the response's output array.
*
* Copied from llama-stack-client's response-helpers until it's available in published version.
*/
export function getResponseOutputText(response: any): string {
const pieces: string[] = [];
for (const output of response.output ?? []) {
if (!output || output.type !== 'message') {
continue;
}
const content = output.content;
if (typeof content === 'string') {
pieces.push(content);
continue;
}
if (!Array.isArray(content)) {
continue;
}
for (const item of content) {
if (typeof item === 'string') {
pieces.push(item);
continue;
}
if (item && item.type === 'output_text' && 'text' in item && typeof item.text === 'string') {
pieces.push(item.text);
}
}
}
return pieces.join('');
}
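To show how these helpers are meant to be used together, here is a hypothetical test file. It assumes a __tests__/ location (per the Jest config above) and that the client exposes responses.create in the same style as the responses.retrieve call used earlier; the prompt and expectation are illustrative only:

// Hypothetical __tests__/example.test.ts -- a usage sketch, not one of the mapped suites.
import { createTestClient, requireTextModel, getResponseOutputText } from '../setup';

describe('example: non-streaming response', () => {
  test('returns non-empty text for a simple prompt', async () => {
    // Forward the test ID so server/replay mode can isolate recordings per test.
    const client = createTestClient(
      'tests/integration/client-typescript/__tests__/example.test.ts::returns non-empty text',
    );
    const response = await client.responses.create({
      model: requireTextModel(),
      input: 'Answer with one word: what color is a clear daytime sky?',
    });
    expect(getResponseOutputText(response).toLowerCase()).toContain('blue');
  });
});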

View file

@@ -0,0 +1,12 @@
[
{
"suite": "responses",
"setup": "gpt",
"files": ["__tests__/responses.test.ts"]
},
{
"suite": "base",
"setup": "ollama",
"files": ["__tests__/inference.test.ts"]
}
]

View file

@@ -0,0 +1,16 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "ES2022",
"lib": ["ES2022"],
"moduleResolution": "bundler",
"esModuleInterop": true,
"allowSyntheticDefaultImports": true,
"strict": true,
"skipLibCheck": true,
"resolveJsonModule": true,
"types": ["jest", "node"]
},
"include": ["**/*.ts"],
"exclude": ["node_modules"]
}

View file

@@ -0,0 +1,773 @@
{
"test_id": "tests/integration/responses/test_tool_responses.py::test_max_tool_calls_with_mcp_tools[client_with_models-txt=openai/gpt-4o]",
"request": {
"method": "POST",
"url": "https://api.openai.com/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-4o",
"messages": [
{
"role": "user",
"content": "Get the experiment ID for 'boiling_point' and get the user ID for 'charlie'"
}
],
"stream": true,
"stream_options": {
"include_usage": true
},
"tools": [
{
"type": "function",
"function": {
"name": "get_user_id",
"description": "\n Get the user ID for a given username. This ID is needed for other operations.\n\n :param username: The username to look up\n :return: The user ID for the username\n ",
"parameters": {
"properties": {
"username": {
"title": "Username",
"type": "string"
}
},
"required": [
"username"
],
"title": "get_user_idArguments",
"type": "object"
}
}
},
{
"type": "function",
"function": {
"name": "get_user_permissions",
"description": "\n Get the permissions for a user ID. Requires a valid user ID from get_user_id.\n\n :param user_id: The user ID to check permissions for\n :return: The permissions for the user\n ",
"parameters": {
"properties": {
"user_id": {
"title": "User Id",
"type": "string"
}
},
"required": [
"user_id"
],
"title": "get_user_permissionsArguments",
"type": "object"
}
}
},
{
"type": "function",
"function": {
"name": "check_file_access",
"description": "\n Check if a user can access a specific file. Requires a valid user ID.\n\n :param user_id: The user ID to check access for\n :param filename: The filename to check access to\n :return: Whether the user can access the file (yes/no)\n ",
"parameters": {
"properties": {
"user_id": {
"title": "User Id",
"type": "string"
},
"filename": {
"title": "Filename",
"type": "string"
}
},
"required": [
"user_id",
"filename"
],
"title": "check_file_accessArguments",
"type": "object"
}
}
},
{
"type": "function",
"function": {
"name": "get_experiment_id",
"description": "\n Get the experiment ID for a given experiment name. This ID is needed to get results.\n\n :param experiment_name: The name of the experiment\n :return: The experiment ID\n ",
"parameters": {
"properties": {
"experiment_name": {
"title": "Experiment Name",
"type": "string"
}
},
"required": [
"experiment_name"
],
"title": "get_experiment_idArguments",
"type": "object"
}
}
},
{
"type": "function",
"function": {
"name": "get_experiment_results",
"description": "\n Get the results for an experiment ID. Requires a valid experiment ID from get_experiment_id.\n\n :param experiment_id: The experiment ID to get results for\n :return: The experiment results\n ",
"parameters": {
"properties": {
"experiment_id": {
"title": "Experiment Id",
"type": "string"
}
},
"required": [
"experiment_id"
],
"title": "get_experiment_resultsArguments",
"type": "object"
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "gpt-4o"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "1V9w3bXnppL"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": "call_y8S7JKR2Qhu4Bh1uxdHRcNDg",
"function": {
"arguments": "",
"name": "get_experiment_id"
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "YEsj"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "{\"ex",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "n"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "perim",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "Q"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "ent_na",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": ""
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "me\":",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "U"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": " \"boi",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": ""
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "ling_p",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": ""
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "oint",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "ha"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "\"}",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "d5D"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": "call_HELkyZOm2fzLx2CeTH3bEcS2",
"function": {
"arguments": "",
"name": "get_user_id"
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "0LbsjDcKz6"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": null,
"function": {
"arguments": "{\"us",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "c"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": null,
"function": {
"arguments": "ernam",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "9"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": null,
"function": {
"arguments": "e\": \"c",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "7C0WFn181I3y3l"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": null,
"function": {
"arguments": "harl",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "wf"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": null,
"function": {
"arguments": "ie\"}",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "r"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "FAci"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1997dc007d20",
"choices": [],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": {
"completion_tokens": 51,
"prompt_tokens": 393,
"total_tokens": 444,
"completion_tokens_details": {
"accepted_prediction_tokens": 0,
"audio_tokens": 0,
"reasoning_tokens": 0,
"rejected_prediction_tokens": 0
},
"prompt_tokens_details": {
"audio_tokens": 0,
"cached_tokens": 0
}
},
"obfuscation": "6xgpRRdKjviPT"
}
}
],
"is_streaming": true
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,593 @@
{
"test_id": "tests/integration/responses/test_tool_responses.py::test_max_tool_calls_with_function_tools[openai_client-txt=openai/gpt-4o]",
"request": {
"method": "POST",
"url": "https://api.openai.com/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-4o",
"messages": [
{
"role": "user",
"content": "Can you tell me the weather in Paris and the current time?"
}
],
"stream": true,
"stream_options": {
"include_usage": true
},
"tools": [
{
"type": "function",
"function": {
"type": "function",
"name": "get_weather",
"description": "Get weather information for a specified location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city name (e.g., 'New York', 'London')"
}
}
},
"strict": null
}
},
{
"type": "function",
"function": {
"type": "function",
"name": "get_time",
"description": "Get current time for a specified location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city name (e.g., 'New York', 'London')"
}
}
},
"strict": null
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "gpt-4o"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-463ab0e2f291",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_b1442291a8",
"usage": null,
"obfuscation": "QmTXstGvpa8"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-463ab0e2f291",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": "call_HJMoLtHXfCzhlMQOfqIKt0n3",
"function": {
"arguments": "",
"name": "get_weather"
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_b1442291a8",
"usage": null,
"obfuscation": "iFjmkK23KL"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-463ab0e2f291",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "{\"lo",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_b1442291a8",
"usage": null,
"obfuscation": "7"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-463ab0e2f291",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "catio",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_b1442291a8",
"usage": null,
"obfuscation": "L"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-463ab0e2f291",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "n\": \"P",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_b1442291a8",
"usage": null,
"obfuscation": "THa6gWbrWhVmZ6"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-463ab0e2f291",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "aris",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_b1442291a8",
"usage": null,
"obfuscation": "eL"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-463ab0e2f291",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "\"}",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_b1442291a8",
"usage": null,
"obfuscation": "jng"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-463ab0e2f291",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": "call_vGKvTKZM7aALMaUw3Jas7lRg",
"function": {
"arguments": "",
"name": "get_time"
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_b1442291a8",
"usage": null,
"obfuscation": "LSailgMcgSl54"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-463ab0e2f291",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": null,
"function": {
"arguments": "{\"lo",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_b1442291a8",
"usage": null,
"obfuscation": "z"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-463ab0e2f291",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": null,
"function": {
"arguments": "catio",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_b1442291a8",
"usage": null,
"obfuscation": "4"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-463ab0e2f291",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": null,
"function": {
"arguments": "n\": \"P",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_b1442291a8",
"usage": null,
"obfuscation": "0engr6vRvqXTEP"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-463ab0e2f291",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": null,
"function": {
"arguments": "aris",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_b1442291a8",
"usage": null,
"obfuscation": "Pe"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-463ab0e2f291",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": null,
"function": {
"arguments": "\"}",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_b1442291a8",
"usage": null,
"obfuscation": "LU9"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-463ab0e2f291",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_b1442291a8",
"usage": null,
"obfuscation": "kD7d"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-463ab0e2f291",
"choices": [],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_b1442291a8",
"usage": {
"completion_tokens": 44,
"prompt_tokens": 110,
"total_tokens": 154,
"completion_tokens_details": {
"accepted_prediction_tokens": 0,
"audio_tokens": 0,
"reasoning_tokens": 0,
"rejected_prediction_tokens": 0
},
"prompt_tokens_details": {
"audio_tokens": 0,
"cached_tokens": 0
}
},
"obfuscation": "R4ICoxqTqj7ZY"
}
}
],
"is_streaming": true
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,773 @@
{
"test_id": "tests/integration/responses/test_tool_responses.py::test_max_tool_calls_with_mcp_tools[openai_client-txt=openai/gpt-4o]",
"request": {
"method": "POST",
"url": "https://api.openai.com/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-4o",
"messages": [
{
"role": "user",
"content": "Get the experiment ID for 'boiling_point' and get the user ID for 'charlie'"
}
],
"stream": true,
"stream_options": {
"include_usage": true
},
"tools": [
{
"type": "function",
"function": {
"name": "get_user_id",
"description": "\n Get the user ID for a given username. This ID is needed for other operations.\n\n :param username: The username to look up\n :return: The user ID for the username\n ",
"parameters": {
"properties": {
"username": {
"title": "Username",
"type": "string"
}
},
"required": [
"username"
],
"title": "get_user_idArguments",
"type": "object"
}
}
},
{
"type": "function",
"function": {
"name": "get_user_permissions",
"description": "\n Get the permissions for a user ID. Requires a valid user ID from get_user_id.\n\n :param user_id: The user ID to check permissions for\n :return: The permissions for the user\n ",
"parameters": {
"properties": {
"user_id": {
"title": "User Id",
"type": "string"
}
},
"required": [
"user_id"
],
"title": "get_user_permissionsArguments",
"type": "object"
}
}
},
{
"type": "function",
"function": {
"name": "check_file_access",
"description": "\n Check if a user can access a specific file. Requires a valid user ID.\n\n :param user_id: The user ID to check access for\n :param filename: The filename to check access to\n :return: Whether the user can access the file (yes/no)\n ",
"parameters": {
"properties": {
"user_id": {
"title": "User Id",
"type": "string"
},
"filename": {
"title": "Filename",
"type": "string"
}
},
"required": [
"user_id",
"filename"
],
"title": "check_file_accessArguments",
"type": "object"
}
}
},
{
"type": "function",
"function": {
"name": "get_experiment_id",
"description": "\n Get the experiment ID for a given experiment name. This ID is needed to get results.\n\n :param experiment_name: The name of the experiment\n :return: The experiment ID\n ",
"parameters": {
"properties": {
"experiment_name": {
"title": "Experiment Name",
"type": "string"
}
},
"required": [
"experiment_name"
],
"title": "get_experiment_idArguments",
"type": "object"
}
}
},
{
"type": "function",
"function": {
"name": "get_experiment_results",
"description": "\n Get the results for an experiment ID. Requires a valid experiment ID from get_experiment_id.\n\n :param experiment_id: The experiment ID to get results for\n :return: The experiment results\n ",
"parameters": {
"properties": {
"experiment_id": {
"title": "Experiment Id",
"type": "string"
}
},
"required": [
"experiment_id"
],
"title": "get_experiment_resultsArguments",
"type": "object"
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "gpt-4o"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "N5OTLR9CfmU"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": "call_z8P1RQv54BLxyMlRdMFkcCGd",
"function": {
"arguments": "",
"name": "get_experiment_id"
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "3EKK"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "{\"ex",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "R"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "perim",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "Q"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "ent_na",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": ""
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "me\":",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "6"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": " \"boi",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": ""
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "ling_p",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": ""
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "oint",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "pw"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "\"}",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "Gfk"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": "call_I5tcLgyMADoVwLKDj9HkTCs5",
"function": {
"arguments": "",
"name": "get_user_id"
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "Yp7IueDs5V"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": null,
"function": {
"arguments": "{\"us",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "8"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": null,
"function": {
"arguments": "ernam",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "X"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": null,
"function": {
"arguments": "e\": \"c",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "2oif8BwVnTCnAF"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": null,
"function": {
"arguments": "harl",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "hv"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 1,
"id": null,
"function": {
"arguments": "ie\"}",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "C"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": null,
"obfuscation": "ctjO"
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-b218af7fa066",
"choices": [],
"created": 0,
"model": "gpt-4o-2024-08-06",
"object": "chat.completion.chunk",
"service_tier": "default",
"system_fingerprint": "fp_c98e05ca17",
"usage": {
"completion_tokens": 51,
"prompt_tokens": 393,
"total_tokens": 444,
"completion_tokens_details": {
"accepted_prediction_tokens": 0,
"audio_tokens": 0,
"reasoning_tokens": 0,
"rejected_prediction_tokens": 0
},
"prompt_tokens_details": {
"audio_tokens": 0,
"cached_tokens": 0
}
},
"obfuscation": "fclbZeBSSKN4C"
}
}
],
"is_streaming": true
},
"id_normalization_mapping": {}
}

Some files were not shown because too many files have changed in this diff