Merge branch 'main' into fix/issue-3797-metadata-validation

2025-12-03 01:48:05 +00:00 · 2025-11-19 10:09:30 -08:00 · 2025-11-19 10:09:30 -08:00 · 0358770791
commit 0358770791
parent aeb3f80966 1e4e02e622
90 changed files with 6769 additions and 247 deletions
--- a/.github/actions/setup-typescript-client/action.yml
+++ b/.github/actions/setup-typescript-client/action.yml
@ -0,0 +1,35 @@
+name: Setup TypeScript client
+description: Conditionally checkout and link llama-stack-client-typescript based on client-version
+inputs:
+  client-version:
+    description: 'Client version (latest or published)'
+    required: true
+
+outputs:
+  ts-client-path:
+    description: 'Path or version to use for TypeScript client'
+    value: ${{ steps.set-path.outputs.ts-client-path }}
+
+runs:
+  using: "composite"
+  steps:
+    - name: Checkout TypeScript client (latest)
+      if: ${{ inputs.client-version == 'latest' }}
+      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      with:
+        repository: llamastack/llama-stack-client-typescript
+        ref: main
+        path: .ts-client-checkout
+
+    - name: Set TS_CLIENT_PATH
+      id: set-path
+      shell: bash
+      run: |
+        if [ "${{ inputs.client-version }}" = "latest" ]; then
+          echo "ts-client-path=${{ github.workspace }}/.ts-client-checkout" >> $GITHUB_OUTPUT
+        elif [ "${{ inputs.client-version }}" = "published" ]; then
+          echo "ts-client-path=^0.3.2" >> $GITHUB_OUTPUT
+        else
+          echo "::error::Invalid client-version: ${{ inputs.client-version }}"
+          exit 1
+        fi
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@ -93,11 +93,27 @@ jobs:
          suite: ${{ matrix.config.suite }}
          inference-mode: 'replay'

+      - name: Setup Node.js for TypeScript client tests
+        if: ${{ matrix.client == 'server' }}
+        uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
+        with:
+          node-version: '20'
+          cache: 'npm'
+          cache-dependency-path: tests/integration/client-typescript/package-lock.json
+
+      - name: Setup TypeScript client
+        if: ${{ matrix.client == 'server' }}
+        id: setup-ts-client
+        uses: ./.github/actions/setup-typescript-client
+        with:
+          client-version: ${{ matrix.client-version }}
+
      - name: Run tests
        if: ${{ matrix.config.allowed_clients == null || contains(matrix.config.allowed_clients, matrix.client) }}
        uses: ./.github/actions/run-and-record-tests
        env:
          OPENAI_API_KEY: dummy
+          TS_CLIENT_PATH: ${{ steps.setup-ts-client.outputs.ts-client-path || '' }}
        with:
          stack-config: >-
            ${{ matrix.config.stack_config
--- a/.github/workflows/stainless-builds.yml
+++ b/.github/workflows/stainless-builds.yml
@ -59,6 +59,30 @@ jobs:
          ref: ${{ github.event.pull_request.head.sha }}
          fetch-depth: 2

+      # Compute the Stainless branch name, prefixing with fork owner if PR is from a fork.
+      # For fork PRs like "contributor:fix/issue-123", this creates "preview/contributor/fix/issue-123"
+      # For same-repo PRs, this creates "preview/fix/issue-123"
+      - name: Compute branch names
+        id: branch-names
+        run: |
+          HEAD_REPO="${{ github.event.pull_request.head.repo.full_name }}"
+          BASE_REPO="${{ github.repository }}"
+          BRANCH_NAME="${{ github.event.pull_request.head.ref }}"
+
+          if [ "$HEAD_REPO" != "$BASE_REPO" ]; then
+            # Fork PR: prefix with fork owner for isolation
+            FORK_OWNER="${{ github.event.pull_request.head.repo.owner.login }}"
+            PREVIEW_BRANCH="preview/${FORK_OWNER}/${BRANCH_NAME}"
+            BASE_BRANCH="preview/base/${FORK_OWNER}/${BRANCH_NAME}"
+          else
+            # Same-repo PR
+            PREVIEW_BRANCH="preview/${BRANCH_NAME}"
+            BASE_BRANCH="preview/base/${BRANCH_NAME}"
+          fi
+
+          echo "preview_branch=${PREVIEW_BRANCH}" >> $GITHUB_OUTPUT
+          echo "base_branch=${BASE_BRANCH}" >> $GITHUB_OUTPUT
+
      # This action builds preview SDKs from the OpenAPI spec changes and
      # posts/updates a comment on the PR with build results and links to the preview.
      - name: Run preview builds
@ -73,6 +97,8 @@ jobs:
          base_sha: ${{ github.event.pull_request.base.sha }}
          base_ref: ${{ github.event.pull_request.base.ref }}
          head_sha: ${{ github.event.pull_request.head.sha }}
+          branch: ${{ steps.branch-names.outputs.preview_branch }}
+          base_branch: ${{ steps.branch-names.outputs.base_branch }}

  merge:
    if: github.event.action == 'closed' && github.event.pull_request.merged == true
@ -90,12 +116,33 @@ jobs:
          ref: ${{ github.event.pull_request.head.sha }}
          fetch-depth: 2

+      # Compute the Stainless branch name, prefixing with fork owner if PR is from a fork.
+      # For fork PRs like "contributor:fix/issue-123", this creates "preview/contributor/fix/issue-123"
+      # For same-repo PRs, this creates "preview/fix/issue-123"
+      - name: Compute branch names
+        id: branch-names
+        run: |
+          HEAD_REPO="${{ github.event.pull_request.head.repo.full_name }}"
+          BASE_REPO="${{ github.repository }}"
+          BRANCH_NAME="${{ github.event.pull_request.head.ref }}"
+
+          if [ "$HEAD_REPO" != "$BASE_REPO" ]; then
+            # Fork PR: prefix with fork owner for isolation
+            FORK_OWNER="${{ github.event.pull_request.head.repo.owner.login }}"
+            MERGE_BRANCH="preview/${FORK_OWNER}/${BRANCH_NAME}"
+          else
+            # Same-repo PR
+            MERGE_BRANCH="preview/${BRANCH_NAME}"
+          fi
+
+          echo "merge_branch=${MERGE_BRANCH}" >> $GITHUB_OUTPUT
+
      # Note that this only merges in changes that happened on the last build on
-      # preview/${{ github.head_ref }}. It's possible that there are OAS/config
-      # changes that haven't been built, if the preview-sdk job didn't finish
+      # the computed preview branch. It's possible that there are OAS/config
+      # changes that haven't been built, if the preview job didn't finish
      # before this step starts. In theory we want to wait for all builds
-      # against preview/${{ github.head_ref }} to complete, but assuming that
-      # the preview-sdk job happens before the PR merge, it should be fine.
+      # against the preview branch to complete, but assuming that
+      # the preview job happens before the PR merge, it should be fine.
      - name: Run merge build
        uses: stainless-api/upload-openapi-spec-action/merge@32823b096b4319c53ee948d702d9052873af485f # 1.6.0
        with:
@ -108,3 +155,4 @@ jobs:
          base_sha: ${{ github.event.pull_request.base.sha }}
          base_ref: ${{ github.event.pull_request.base.ref }}
          head_sha: ${{ github.event.pull_request.head.sha }}
+          merge_branch: ${{ steps.branch-names.outputs.merge_branch }}
--- a/.gitignore
+++ b/.gitignore
@ -35,3 +35,5 @@ docs/static/imported-files/
 docs/docs/api-deprecated/
 docs/docs/api-experimental/
 docs/docs/api/
+tests/integration/client-typescript/node_modules/
+.ts-client-checkout/
--- a/docs/docs/providers/inference/remote_azure.mdx
+++ b/docs/docs/providers/inference/remote_azure.mdx
@ -24,7 +24,7 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No |  | Authentication credential for the provider |
-| `api_base` | `HttpUrl` | No |  | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) |
+| `base_url` | `HttpUrl \| None` | No |  | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com/openai/v1) |
 | `api_version` | `str \| None` | No |  | Azure API version for Azure (e.g., 2024-12-01-preview) |
 | `api_type` | `str \| None` | No | azure | Azure API type for Azure (e.g., azure) |

@ -32,7 +32,7 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview

 ```yaml
 api_key: ${env.AZURE_API_KEY:=}
-api_base: ${env.AZURE_API_BASE:=}
+base_url: ${env.AZURE_API_BASE:=}
 api_version: ${env.AZURE_API_VERSION:=}
 api_type: ${env.AZURE_API_TYPE:=}
 ```
--- a/docs/docs/providers/inference/remote_cerebras.mdx
+++ b/docs/docs/providers/inference/remote_cerebras.mdx
@ -17,11 +17,11 @@ Cerebras inference provider for running models on Cerebras Cloud platform.
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No |  | Authentication credential for the provider |
-| `base_url` | `str` | No | https://api.cerebras.ai | Base URL for the Cerebras API |
+| `base_url` | `HttpUrl \| None` | No | https://api.cerebras.ai/v1 | Base URL for the Cerebras API |

 ## Sample Configuration

 ```yaml
-base_url: https://api.cerebras.ai
+base_url: https://api.cerebras.ai/v1
 api_key: ${env.CEREBRAS_API_KEY:=}
 ```
--- a/docs/docs/providers/inference/remote_databricks.mdx
+++ b/docs/docs/providers/inference/remote_databricks.mdx
@ -17,11 +17,11 @@ Databricks inference provider for running models on Databricks' unified analytic
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_token` | `SecretStr \| None` | No |  | The Databricks API token |
-| `url` | `str \| None` | No |  | The URL for the Databricks model serving endpoint |
+| `base_url` | `HttpUrl \| None` | No |  | The URL for the Databricks model serving endpoint (should include /serving-endpoints path) |

 ## Sample Configuration

 ```yaml
-url: ${env.DATABRICKS_HOST:=}
+base_url: ${env.DATABRICKS_HOST:=}
 api_token: ${env.DATABRICKS_TOKEN:=}
 ```
--- a/docs/docs/providers/inference/remote_fireworks.mdx
+++ b/docs/docs/providers/inference/remote_fireworks.mdx
@ -17,11 +17,11 @@ Fireworks AI inference provider for Llama models and other AI models on the Fire
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No |  | Authentication credential for the provider |
-| `url` | `str` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |
+| `base_url` | `HttpUrl \| None` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |

 ## Sample Configuration

 ```yaml
-url: https://api.fireworks.ai/inference/v1
+base_url: https://api.fireworks.ai/inference/v1
 api_key: ${env.FIREWORKS_API_KEY:=}
 ```
--- a/docs/docs/providers/inference/remote_groq.mdx
+++ b/docs/docs/providers/inference/remote_groq.mdx
@ -17,11 +17,11 @@ Groq inference provider for ultra-fast inference using Groq's LPU technology.
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No |  | Authentication credential for the provider |
-| `url` | `str` | No | https://api.groq.com | The URL for the Groq AI server |
+| `base_url` | `HttpUrl \| None` | No | https://api.groq.com/openai/v1 | The URL for the Groq AI server |

 ## Sample Configuration

 ```yaml
-url: https://api.groq.com
+base_url: https://api.groq.com/openai/v1
 api_key: ${env.GROQ_API_KEY:=}
 ```
--- a/docs/docs/providers/inference/remote_llama-openai-compat.mdx
+++ b/docs/docs/providers/inference/remote_llama-openai-compat.mdx
@ -17,11 +17,11 @@ Llama OpenAI-compatible provider for using Llama models with OpenAI API format.
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No |  | Authentication credential for the provider |
-| `openai_compat_api_base` | `str` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |
+| `base_url` | `HttpUrl \| None` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |

 ## Sample Configuration

 ```yaml
-openai_compat_api_base: https://api.llama.com/compat/v1/
+base_url: https://api.llama.com/compat/v1/
 api_key: ${env.LLAMA_API_KEY}
 ```
--- a/docs/docs/providers/inference/remote_nvidia.mdx
+++ b/docs/docs/providers/inference/remote_nvidia.mdx
@ -17,15 +17,13 @@ NVIDIA inference provider for accessing NVIDIA NIM models and AI services.
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No |  | Authentication credential for the provider |
-| `url` | `str` | No | https://integrate.api.nvidia.com | A base url for accessing the NVIDIA NIM |
+| `base_url` | `HttpUrl \| None` | No | https://integrate.api.nvidia.com/v1 | A base url for accessing the NVIDIA NIM |
 | `timeout` | `int` | No | 60 | Timeout for the HTTP requests |
-| `append_api_version` | `bool` | No | True | When set to false, the API version will not be appended to the base_url. By default, it is true. |
 | `rerank_model_to_url` | `dict[str, str]` | No | `{'nv-rerank-qa-mistral-4b:1': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking', 'nvidia/nv-rerankqa-mistral-4b-v3': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/nv-rerankqa-mistral-4b-v3/reranking', 'nvidia/llama-3.2-nv-rerankqa-1b-v2': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking'}` | Mapping of rerank model identifiers to their API endpoints.  |

 ## Sample Configuration

 ```yaml
-url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
 api_key: ${env.NVIDIA_API_KEY:=}
-append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
 ```
--- a/docs/docs/providers/inference/remote_ollama.mdx
+++ b/docs/docs/providers/inference/remote_ollama.mdx
@ -16,10 +16,10 @@ Ollama inference provider for running local models through the Ollama runtime.
 |-------|------|----------|---------|-------------|
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
-| `url` | `str` | No | http://localhost:11434 |  |
+| `base_url` | `HttpUrl \| None` | No | http://localhost:11434/v1 |  |

 ## Sample Configuration

 ```yaml
-url: ${env.OLLAMA_URL:=http://localhost:11434}
+base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
 ```
--- a/docs/docs/providers/inference/remote_openai.mdx
+++ b/docs/docs/providers/inference/remote_openai.mdx
@ -17,7 +17,7 @@ OpenAI inference provider for accessing GPT models and other OpenAI services.
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No |  | Authentication credential for the provider |
-| `base_url` | `str` | No | https://api.openai.com/v1 | Base URL for OpenAI API |
+| `base_url` | `HttpUrl \| None` | No | https://api.openai.com/v1 | Base URL for OpenAI API |

 ## Sample Configuration

--- a/docs/docs/providers/inference/remote_passthrough.mdx
+++ b/docs/docs/providers/inference/remote_passthrough.mdx
@ -17,11 +17,11 @@ Passthrough inference provider for connecting to any external inference service
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No |  | Authentication credential for the provider |
-| `url` | `str` | No |  | The URL for the passthrough endpoint |
+| `base_url` | `HttpUrl \| None` | No |  | The URL for the passthrough endpoint |

 ## Sample Configuration

 ```yaml
-url: ${env.PASSTHROUGH_URL}
+base_url: ${env.PASSTHROUGH_URL}
 api_key: ${env.PASSTHROUGH_API_KEY}
 ```
--- a/docs/docs/providers/inference/remote_runpod.mdx
+++ b/docs/docs/providers/inference/remote_runpod.mdx
@ -17,11 +17,11 @@ RunPod inference provider for running models on RunPod's cloud GPU platform.
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_token` | `SecretStr \| None` | No |  | The API token |
-| `url` | `str \| None` | No |  | The URL for the Runpod model serving endpoint |
+| `base_url` | `HttpUrl \| None` | No |  | The URL for the Runpod model serving endpoint |

 ## Sample Configuration

 ```yaml
-url: ${env.RUNPOD_URL:=}
+base_url: ${env.RUNPOD_URL:=}
 api_token: ${env.RUNPOD_API_TOKEN}
 ```
--- a/docs/docs/providers/inference/remote_sambanova.mdx
+++ b/docs/docs/providers/inference/remote_sambanova.mdx
@ -17,11 +17,11 @@ SambaNova inference provider for running models on SambaNova's dataflow architec
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No |  | Authentication credential for the provider |
-| `url` | `str` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
+| `base_url` | `HttpUrl \| None` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |

 ## Sample Configuration

 ```yaml
-url: https://api.sambanova.ai/v1
+base_url: https://api.sambanova.ai/v1
 api_key: ${env.SAMBANOVA_API_KEY:=}
 ```
--- a/docs/docs/providers/inference/remote_tgi.mdx
+++ b/docs/docs/providers/inference/remote_tgi.mdx
@ -16,10 +16,10 @@ Text Generation Inference (TGI) provider for HuggingFace model serving.
 |-------|------|----------|---------|-------------|
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
-| `url` | `str` | No |  | The URL for the TGI serving endpoint |
+| `base_url` | `HttpUrl \| None` | No |  | The URL for the TGI serving endpoint (should include /v1 path) |

 ## Sample Configuration

 ```yaml
-url: ${env.TGI_URL:=}
+base_url: ${env.TGI_URL:=}
 ```
--- a/docs/docs/providers/inference/remote_together.mdx
+++ b/docs/docs/providers/inference/remote_together.mdx
@ -17,11 +17,11 @@ Together AI inference provider for open-source models and collaborative AI devel
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No |  | Authentication credential for the provider |
-| `url` | `str` | No | https://api.together.xyz/v1 | The URL for the Together AI server |
+| `base_url` | `HttpUrl \| None` | No | https://api.together.xyz/v1 | The URL for the Together AI server |

 ## Sample Configuration

 ```yaml
-url: https://api.together.xyz/v1
+base_url: https://api.together.xyz/v1
 api_key: ${env.TOGETHER_API_KEY:=}
 ```
--- a/docs/docs/providers/inference/remote_vllm.mdx
+++ b/docs/docs/providers/inference/remote_vllm.mdx
@ -17,14 +17,14 @@ Remote vLLM inference provider for connecting to vLLM servers.
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_token` | `SecretStr \| None` | No |  | The API token |
-| `url` | `str \| None` | No |  | The URL for the vLLM model serving endpoint |
+| `base_url` | `HttpUrl \| None` | No |  | The URL for the vLLM model serving endpoint |
 | `max_tokens` | `int` | No | 4096 | Maximum number of tokens to generate. |
 | `tls_verify` | `bool \| str` | No | True | Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file. |

 ## Sample Configuration

 ```yaml
-url: ${env.VLLM_URL:=}
+base_url: ${env.VLLM_URL:=}
 max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
 api_token: ${env.VLLM_API_TOKEN:=fake}
 tls_verify: ${env.VLLM_TLS_VERIFY:=true}
--- a/docs/docs/providers/inference/remote_watsonx.mdx
+++ b/docs/docs/providers/inference/remote_watsonx.mdx
@ -17,14 +17,14 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform
 | `allowed_models` | `list[str] \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
 | `api_key` | `SecretStr \| None` | No |  | Authentication credential for the provider |
-| `url` | `str` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
+| `base_url` | `HttpUrl \| None` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
 | `project_id` | `str \| None` | No |  | The watsonx.ai project ID |
 | `timeout` | `int` | No | 60 | Timeout for the HTTP requests |

 ## Sample Configuration

 ```yaml
-url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
+base_url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
 api_key: ${env.WATSONX_API_KEY:=}
 project_id: ${env.WATSONX_PROJECT_ID:=}
 ```
--- a/scripts/docker.sh
+++ b/scripts/docker.sh
@ -287,9 +287,9 @@ start_container() {
    # On macOS/Windows, use host.docker.internal to reach host from container
    # On Linux with --network host, use localhost
    if [[ "$(uname)" == "Darwin" ]] || [[ "$(uname)" == *"MINGW"* ]]; then
-        OLLAMA_URL="${OLLAMA_URL:-http://host.docker.internal:11434}"
+        OLLAMA_URL="${OLLAMA_URL:-http://host.docker.internal:11434/v1}"
    else
-        OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
+        OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434/v1}"
    fi
    DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OLLAMA_URL=$OLLAMA_URL"

--- a/scripts/get_setup_env.py
+++ b/scripts/get_setup_env.py
@ -16,16 +16,16 @@ import sys
 from tests.integration.suites import SETUP_DEFINITIONS, SUITE_DEFINITIONS


-def get_setup_env_vars(setup_name, suite_name=None):
+def get_setup_config(setup_name, suite_name=None):
    """
-    Get environment variables for a setup, with optional suite default fallback.
+    Get full configuration (env vars + defaults) for a setup.

    Args:
        setup_name: Name of the setup (e.g., 'ollama', 'gpt')
        suite_name: Optional suite name to get default setup if setup_name is None

    Returns:
-        Dictionary of environment variables
+        Dictionary with 'env' and 'defaults' keys
    """
    # If no setup specified, try to get default from suite
    if not setup_name and suite_name:
@ -34,7 +34,7 @@ def get_setup_env_vars(setup_name, suite_name=None):
            setup_name = suite.default_setup

    if not setup_name:
-        return {}
+        return {"env": {}, "defaults": {}}

    setup = SETUP_DEFINITIONS.get(setup_name)
    if not setup:
@ -44,27 +44,31 @@ def get_setup_env_vars(setup_name, suite_name=None):
        )
        sys.exit(1)

-    return setup.env
+    return {"env": setup.env, "defaults": setup.defaults}


 def main():
-    parser = argparse.ArgumentParser(description="Extract environment variables from a test setup")
+    parser = argparse.ArgumentParser(description="Extract environment variables and defaults from a test setup")
    parser.add_argument("--setup", help="Setup name (e.g., ollama, gpt)")
    parser.add_argument("--suite", help="Suite name to get default setup from if --setup not provided")
    parser.add_argument("--format", choices=["bash", "json"], default="bash", help="Output format (default: bash)")

    args = parser.parse_args()

-    env_vars = get_setup_env_vars(args.setup, args.suite)
+    config = get_setup_config(args.setup, args.suite)

    if args.format == "bash":
-        # Output as bash export statements
-        for key, value in env_vars.items():
+        # Output env vars as bash export statements
+        for key, value in config["env"].items():
            print(f"export {key}='{value}'")
+        # Output defaults as bash export statements with LLAMA_STACK_TEST_ prefix
+        for key, value in config["defaults"].items():
+            env_key = f"LLAMA_STACK_TEST_{key.upper()}"
+            print(f"export {env_key}='{value}'")
    elif args.format == "json":
        import json

-        print(json.dumps(env_vars))
+        print(json.dumps(config))


 if __name__ == "__main__":
--- a/scripts/install.sh
+++ b/scripts/install.sh
@ -640,7 +640,7 @@ cmd=( run -d "${PLATFORM_OPTS[@]}" --name llama-stack \
      --network llama-net \
      -p "${PORT}:${PORT}" \
      "${server_env_opts[@]}" \
-      -e OLLAMA_URL="http://ollama-server:${OLLAMA_PORT}" \
+      -e OLLAMA_URL="http://ollama-server:${OLLAMA_PORT}/v1" \
      "${SERVER_IMAGE}" --port "${PORT}")

 log "🦙 Starting Llama Stack..."
--- a/scripts/integration-tests.sh
+++ b/scripts/integration-tests.sh
@ -181,6 +181,10 @@ echo "$SETUP_ENV"
 eval "$SETUP_ENV"
 echo ""

+# Export suite and setup names for TypeScript tests
+export LLAMA_STACK_TEST_SUITE="$TEST_SUITE"
+export LLAMA_STACK_TEST_SETUP="$TEST_SETUP"
+
 ROOT_DIR="$THIS_DIR/.."
 cd $ROOT_DIR

@ -212,6 +216,71 @@ find_available_port() {
    return 1
 }

+run_client_ts_tests() {
+    if ! command -v npm &>/dev/null; then
+        echo "npm could not be found; ensure Node.js is installed"
+        return 1
+    fi
+
+    pushd tests/integration/client-typescript >/dev/null
+
+    # Determine if TS_CLIENT_PATH is a directory path or an npm version
+    if [[ -d "$TS_CLIENT_PATH" ]]; then
+        # It's a directory path - use local checkout
+        if [[ ! -f "$TS_CLIENT_PATH/package.json" ]]; then
+            echo "Error: $TS_CLIENT_PATH exists but doesn't look like llama-stack-client-typescript (no package.json)"
+            popd >/dev/null
+            return 1
+        fi
+        echo "Using local llama-stack-client-typescript from: $TS_CLIENT_PATH"
+
+        # Build the TypeScript client first
+        echo "Building TypeScript client..."
+        pushd "$TS_CLIENT_PATH" >/dev/null
+        npm install --silent
+        npm run build --silent
+        popd >/dev/null
+
+        # Install other dependencies first
+        if [[ "${CI:-}" == "true" || "${CI:-}" == "1" ]]; then
+            npm ci --silent
+        else
+            npm install --silent
+        fi
+
+        # Then install the client from local directory
+        echo "Installing llama-stack-client from: $TS_CLIENT_PATH"
+        npm install "$TS_CLIENT_PATH" --silent
+    else
+        # It's an npm version specifier - install from npm
+        echo "Installing llama-stack-client@${TS_CLIENT_PATH} from npm"
+        if [[ "${CI:-}" == "true" || "${CI:-}" == "1" ]]; then
+            npm ci --silent
+            npm install "llama-stack-client@${TS_CLIENT_PATH}" --silent
+        else
+            npm install "llama-stack-client@${TS_CLIENT_PATH}" --silent
+        fi
+    fi
+
+    # Verify installation
+    echo "Verifying llama-stack-client installation..."
+    if npm list llama-stack-client 2>/dev/null | grep -q llama-stack-client; then
+        echo "✅ llama-stack-client successfully installed"
+        npm list llama-stack-client
+    else
+        echo "❌ llama-stack-client not found in node_modules"
+        echo "Installed packages:"
+        npm list --depth=0
+        popd >/dev/null
+        return 1
+    fi
+
+    echo "Running TypeScript tests for suite $TEST_SUITE (setup $TEST_SETUP)"
+    npm test
+
+    popd >/dev/null
+}
+
 # Start Llama Stack Server if needed
 if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
    # Find an available port for the server
@ -221,6 +290,7 @@ if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
        exit 1
    fi
    export LLAMA_STACK_PORT
+    export TEST_API_BASE_URL="http://localhost:$LLAMA_STACK_PORT"
    echo "Will use port: $LLAMA_STACK_PORT"

    stop_server() {
@ -298,6 +368,7 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
        exit 1
    fi
    export LLAMA_STACK_PORT
+    export TEST_API_BASE_URL="http://localhost:$LLAMA_STACK_PORT"
    echo "Will use port: $LLAMA_STACK_PORT"

    echo "=== Building Docker Image for distribution: $DISTRO ==="
@ -506,5 +577,10 @@ else
    exit 1
 fi

+# Run TypeScript client tests if TS_CLIENT_PATH is set
+if [[ $exit_code -eq 0 && -n "${TS_CLIENT_PATH:-}" && "${LLAMA_STACK_TEST_STACK_CONFIG_TYPE:-}" == "server" ]]; then
+    run_client_ts_tests
+fi
+
 echo ""
 echo "=== Integration Tests Complete ==="
--- a/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
@ -17,32 +17,32 @@ providers:
  - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
    provider_type: remote::cerebras
    config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
      api_key: ${env.CEREBRAS_API_KEY:=}
  - provider_id: ${env.OLLAMA_URL:+ollama}
    provider_type: remote::ollama
    config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
  - provider_id: ${env.VLLM_URL:+vllm}
    provider_type: remote::vllm
    config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: ${env.TGI_URL:+tgi}
    provider_type: remote::tgi
    config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
  - provider_id: fireworks
    provider_type: remote::fireworks
    config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
      api_key: ${env.FIREWORKS_API_KEY:=}
  - provider_id: together
    provider_type: remote::together
    config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
@ -52,9 +52,8 @@ providers:
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
      api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
  - provider_id: openai
    provider_type: remote::openai
    config:
@ -76,18 +75,18 @@ providers:
  - provider_id: groq
    provider_type: remote::groq
    config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
      api_key: ${env.GROQ_API_KEY:=}
  - provider_id: sambanova
    provider_type: remote::sambanova
    config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
      api_key: ${env.SAMBANOVA_API_KEY:=}
  - provider_id: ${env.AZURE_API_KEY:+azure}
    provider_type: remote::azure
    config:
      api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
      api_version: ${env.AZURE_API_VERSION:=}
      api_type: ${env.AZURE_API_TYPE:=}
  - provider_id: sentence-transformers
--- a/src/llama_stack/distributions/ci-tests/run.yaml
+++ b/src/llama_stack/distributions/ci-tests/run.yaml
@ -17,32 +17,32 @@ providers:
  - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
    provider_type: remote::cerebras
    config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
      api_key: ${env.CEREBRAS_API_KEY:=}
  - provider_id: ${env.OLLAMA_URL:+ollama}
    provider_type: remote::ollama
    config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
  - provider_id: ${env.VLLM_URL:+vllm}
    provider_type: remote::vllm
    config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: ${env.TGI_URL:+tgi}
    provider_type: remote::tgi
    config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
  - provider_id: fireworks
    provider_type: remote::fireworks
    config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
      api_key: ${env.FIREWORKS_API_KEY:=}
  - provider_id: together
    provider_type: remote::together
    config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
@ -52,9 +52,8 @@ providers:
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
      api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
  - provider_id: openai
    provider_type: remote::openai
    config:
@ -76,18 +75,18 @@ providers:
  - provider_id: groq
    provider_type: remote::groq
    config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
      api_key: ${env.GROQ_API_KEY:=}
  - provider_id: sambanova
    provider_type: remote::sambanova
    config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
      api_key: ${env.SAMBANOVA_API_KEY:=}
  - provider_id: ${env.AZURE_API_KEY:+azure}
    provider_type: remote::azure
    config:
      api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
      api_version: ${env.AZURE_API_VERSION:=}
      api_type: ${env.AZURE_API_TYPE:=}
  - provider_id: sentence-transformers
--- a/src/llama_stack/distributions/nvidia/run-with-safety.yaml
+++ b/src/llama_stack/distributions/nvidia/run-with-safety.yaml
@ -16,9 +16,8 @@ providers:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
      api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
--- a/src/llama_stack/distributions/nvidia/run.yaml
+++ b/src/llama_stack/distributions/nvidia/run.yaml
@ -16,9 +16,8 @@ providers:
  - provider_id: nvidia
    provider_type: remote::nvidia
    config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
      api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
--- a/src/llama_stack/distributions/open-benchmark/run.yaml
+++ b/src/llama_stack/distributions/open-benchmark/run.yaml
@ -27,12 +27,12 @@ providers:
  - provider_id: groq
    provider_type: remote::groq
    config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
      api_key: ${env.GROQ_API_KEY:=}
  - provider_id: together
    provider_type: remote::together
    config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
      api_key: ${env.TOGETHER_API_KEY:=}
  vector_io:
  - provider_id: sqlite-vec
--- a/src/llama_stack/distributions/postgres-demo/run.yaml
+++ b/src/llama_stack/distributions/postgres-demo/run.yaml
@ -11,7 +11,7 @@ providers:
  - provider_id: vllm-inference
    provider_type: remote::vllm
    config:
-      url: ${env.VLLM_URL:=http://localhost:8000/v1}
+      base_url: ${env.VLLM_URL:=}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
--- a/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
@ -17,32 +17,32 @@ providers:
  - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
    provider_type: remote::cerebras
    config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
      api_key: ${env.CEREBRAS_API_KEY:=}
  - provider_id: ${env.OLLAMA_URL:+ollama}
    provider_type: remote::ollama
    config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
  - provider_id: ${env.VLLM_URL:+vllm}
    provider_type: remote::vllm
    config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: ${env.TGI_URL:+tgi}
    provider_type: remote::tgi
    config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
  - provider_id: fireworks
    provider_type: remote::fireworks
    config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
      api_key: ${env.FIREWORKS_API_KEY:=}
  - provider_id: together
    provider_type: remote::together
    config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
@ -52,9 +52,8 @@ providers:
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
      api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
  - provider_id: openai
    provider_type: remote::openai
    config:
@ -76,18 +75,18 @@ providers:
  - provider_id: groq
    provider_type: remote::groq
    config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
      api_key: ${env.GROQ_API_KEY:=}
  - provider_id: sambanova
    provider_type: remote::sambanova
    config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
      api_key: ${env.SAMBANOVA_API_KEY:=}
  - provider_id: ${env.AZURE_API_KEY:+azure}
    provider_type: remote::azure
    config:
      api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
      api_version: ${env.AZURE_API_VERSION:=}
      api_type: ${env.AZURE_API_TYPE:=}
  - provider_id: sentence-transformers
--- a/src/llama_stack/distributions/starter-gpu/run.yaml
+++ b/src/llama_stack/distributions/starter-gpu/run.yaml
@ -17,32 +17,32 @@ providers:
  - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
    provider_type: remote::cerebras
    config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
      api_key: ${env.CEREBRAS_API_KEY:=}
  - provider_id: ${env.OLLAMA_URL:+ollama}
    provider_type: remote::ollama
    config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
  - provider_id: ${env.VLLM_URL:+vllm}
    provider_type: remote::vllm
    config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: ${env.TGI_URL:+tgi}
    provider_type: remote::tgi
    config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
  - provider_id: fireworks
    provider_type: remote::fireworks
    config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
      api_key: ${env.FIREWORKS_API_KEY:=}
  - provider_id: together
    provider_type: remote::together
    config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
@ -52,9 +52,8 @@ providers:
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
      api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
  - provider_id: openai
    provider_type: remote::openai
    config:
@ -76,18 +75,18 @@ providers:
  - provider_id: groq
    provider_type: remote::groq
    config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
      api_key: ${env.GROQ_API_KEY:=}
  - provider_id: sambanova
    provider_type: remote::sambanova
    config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
      api_key: ${env.SAMBANOVA_API_KEY:=}
  - provider_id: ${env.AZURE_API_KEY:+azure}
    provider_type: remote::azure
    config:
      api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
      api_version: ${env.AZURE_API_VERSION:=}
      api_type: ${env.AZURE_API_TYPE:=}
  - provider_id: sentence-transformers
--- a/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
@ -17,32 +17,32 @@ providers:
  - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
    provider_type: remote::cerebras
    config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
      api_key: ${env.CEREBRAS_API_KEY:=}
  - provider_id: ${env.OLLAMA_URL:+ollama}
    provider_type: remote::ollama
    config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
  - provider_id: ${env.VLLM_URL:+vllm}
    provider_type: remote::vllm
    config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: ${env.TGI_URL:+tgi}
    provider_type: remote::tgi
    config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
  - provider_id: fireworks
    provider_type: remote::fireworks
    config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
      api_key: ${env.FIREWORKS_API_KEY:=}
  - provider_id: together
    provider_type: remote::together
    config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
@ -52,9 +52,8 @@ providers:
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
      api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
  - provider_id: openai
    provider_type: remote::openai
    config:
@ -76,18 +75,18 @@ providers:
  - provider_id: groq
    provider_type: remote::groq
    config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
      api_key: ${env.GROQ_API_KEY:=}
  - provider_id: sambanova
    provider_type: remote::sambanova
    config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
      api_key: ${env.SAMBANOVA_API_KEY:=}
  - provider_id: ${env.AZURE_API_KEY:+azure}
    provider_type: remote::azure
    config:
      api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
      api_version: ${env.AZURE_API_VERSION:=}
      api_type: ${env.AZURE_API_TYPE:=}
  - provider_id: sentence-transformers
--- a/src/llama_stack/distributions/starter/run.yaml
+++ b/src/llama_stack/distributions/starter/run.yaml
@ -17,32 +17,32 @@ providers:
  - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
    provider_type: remote::cerebras
    config:
-      base_url: https://api.cerebras.ai
+      base_url: https://api.cerebras.ai/v1
      api_key: ${env.CEREBRAS_API_KEY:=}
  - provider_id: ${env.OLLAMA_URL:+ollama}
    provider_type: remote::ollama
    config:
-      url: ${env.OLLAMA_URL:=http://localhost:11434}
+      base_url: ${env.OLLAMA_URL:=http://localhost:11434/v1}
  - provider_id: ${env.VLLM_URL:+vllm}
    provider_type: remote::vllm
    config:
-      url: ${env.VLLM_URL:=}
+      base_url: ${env.VLLM_URL:=}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: ${env.TGI_URL:+tgi}
    provider_type: remote::tgi
    config:
-      url: ${env.TGI_URL:=}
+      base_url: ${env.TGI_URL:=}
  - provider_id: fireworks
    provider_type: remote::fireworks
    config:
-      url: https://api.fireworks.ai/inference/v1
+      base_url: https://api.fireworks.ai/inference/v1
      api_key: ${env.FIREWORKS_API_KEY:=}
  - provider_id: together
    provider_type: remote::together
    config:
-      url: https://api.together.xyz/v1
+      base_url: https://api.together.xyz/v1
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
@ -52,9 +52,8 @@ providers:
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:
-      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      base_url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}
      api_key: ${env.NVIDIA_API_KEY:=}
-      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
  - provider_id: openai
    provider_type: remote::openai
    config:
@ -76,18 +75,18 @@ providers:
  - provider_id: groq
    provider_type: remote::groq
    config:
-      url: https://api.groq.com
+      base_url: https://api.groq.com/openai/v1
      api_key: ${env.GROQ_API_KEY:=}
  - provider_id: sambanova
    provider_type: remote::sambanova
    config:
-      url: https://api.sambanova.ai/v1
+      base_url: https://api.sambanova.ai/v1
      api_key: ${env.SAMBANOVA_API_KEY:=}
  - provider_id: ${env.AZURE_API_KEY:+azure}
    provider_type: remote::azure
    config:
      api_key: ${env.AZURE_API_KEY:=}
-      api_base: ${env.AZURE_API_BASE:=}
+      base_url: ${env.AZURE_API_BASE:=}
      api_version: ${env.AZURE_API_VERSION:=}
      api_type: ${env.AZURE_API_TYPE:=}
  - provider_id: sentence-transformers
--- a/src/llama_stack/distributions/watsonx/run.yaml
+++ b/src/llama_stack/distributions/watsonx/run.yaml
@ -15,7 +15,7 @@ providers:
  - provider_id: watsonx
    provider_type: remote::watsonx
    config:
-      url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
+      base_url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
      api_key: ${env.WATSONX_API_KEY:=}
      project_id: ${env.WATSONX_PROJECT_ID:=}
  vector_io:
--- a/src/llama_stack/providers/inline/agents/meta_reference/init.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/init.py
@ -23,7 +23,7 @@ async def get_provider_impl(
        config,
        deps[Api.inference],
        deps[Api.vector_io],
-        deps[Api.safety],
+        deps.get(Api.safety),
        deps[Api.tool_runtime],
        deps[Api.tool_groups],
        deps[Api.conversations],
--- a/src/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/agents.py
@ -41,7 +41,7 @@ class MetaReferenceAgentsImpl(Agents):
        config: MetaReferenceAgentsImplConfig,
        inference_api: Inference,
        vector_io_api: VectorIO,
-        safety_api: Safety,
+        safety_api: Safety | None,
        tool_runtime_api: ToolRuntime,
        tool_groups_api: ToolGroups,
        conversations_api: Conversations,
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
@ -67,7 +67,7 @@ class OpenAIResponsesImpl:
        tool_runtime_api: ToolRuntime,
        responses_store: ResponsesStore,
        vector_io_api: VectorIO,  # VectorIO
-        safety_api: Safety,
+        safety_api: Safety | None,
        conversations_api: Conversations,
    ):
        self.inference_api = inference_api
@ -273,6 +273,14 @@ class OpenAIResponsesImpl:

        guardrail_ids = extract_guardrail_ids(guardrails) if guardrails else []

+        # Validate that Safety API is available if guardrails are requested
+        if guardrail_ids and self.safety_api is None:
+            raise ValueError(
+                "Cannot process guardrails: Safety API is not configured.\n\n"
+                "To use guardrails, ensure the Safety API is configured in your stack, or remove "
+                "the 'guardrails' parameter from your request."
+            )
+
        if conversation is not None:
            if previous_response_id is not None:
                raise ValueError(
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@ -66,6 +66,7 @@ from llama_stack_api import (
    OpenAIResponseUsage,
    OpenAIResponseUsageInputTokensDetails,
    OpenAIResponseUsageOutputTokensDetails,
+    Safety,
    WebSearchToolTypes,
 )

@ -111,7 +112,7 @@ class StreamingResponseOrchestrator:
        max_infer_iters: int,
        tool_executor,  # Will be the tool execution logic from the main class
        instructions: str | None,
-        safety_api,
+        safety_api: Safety | None,
        guardrail_ids: list[str] | None = None,
        prompt: OpenAIResponsePrompt | None = None,
        parallel_tool_calls: bool | None = None,
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/utils.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/utils.py
@ -320,11 +320,15 @@ def is_function_tool_call(
    return False


-async def run_guardrails(safety_api: Safety, messages: str, guardrail_ids: list[str]) -> str | None:
+async def run_guardrails(safety_api: Safety | None, messages: str, guardrail_ids: list[str]) -> str | None:
    """Run guardrails against messages and return violation message if blocked."""
    if not messages:
        return None

+    # If safety API is not available, skip guardrails
+    if safety_api is None:
+        return None
+
    # Look up shields to get their provider_resource_id (actual model ID)
    model_ids = []
    # TODO: list_shields not in Safety interface but available at runtime via API routing
--- a/src/llama_stack/providers/registry/agents.py
+++ b/src/llama_stack/providers/registry/agents.py
@ -30,12 +30,14 @@ def available_providers() -> list[ProviderSpec]:
            config_class="llama_stack.providers.inline.agents.meta_reference.MetaReferenceAgentsImplConfig",
            api_dependencies=[
                Api.inference,
-                Api.safety,
                Api.vector_io,
                Api.tool_runtime,
                Api.tool_groups,
                Api.conversations,
            ],
+            optional_api_dependencies=[
+                Api.safety,
+            ],
            description="Meta's reference implementation of an agent system that can use tools, access vector databases, and perform complex reasoning tasks.",
        ),
    ]
--- a/src/llama_stack/providers/remote/inference/azure/azure.py
+++ b/src/llama_stack/providers/remote/inference/azure/azure.py
@ -4,8 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from urllib.parse import urljoin
-
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

 from .config import AzureConfig
@ -22,4 +20,4 @@ class AzureInferenceAdapter(OpenAIMixin):

        Returns the Azure API base URL from the configuration.
        """
-        return urljoin(str(self.config.api_base), "/openai/v1")
+        return str(self.config.base_url)
--- a/src/llama_stack/providers/remote/inference/azure/config.py
+++ b/src/llama_stack/providers/remote/inference/azure/config.py
@ -32,8 +32,9 @@ class AzureProviderDataValidator(BaseModel):

@json_schema_type
 class AzureConfig(RemoteInferenceProviderConfig):
-    api_base: HttpUrl = Field(
-        description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com)",
+    base_url: HttpUrl | None = Field(
+        default=None,
+        description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com/openai/v1)",
    )
    api_version: str | None = Field(
        default_factory=lambda: os.getenv("AZURE_API_VERSION"),
@ -48,14 +49,14 @@ class AzureConfig(RemoteInferenceProviderConfig):
    def sample_run_config(
        cls,
        api_key: str = "${env.AZURE_API_KEY:=}",
-        api_base: str = "${env.AZURE_API_BASE:=}",
+        base_url: str = "${env.AZURE_API_BASE:=}",
        api_version: str = "${env.AZURE_API_VERSION:=}",
        api_type: str = "${env.AZURE_API_TYPE:=}",
        **kwargs,
    ) -> dict[str, Any]:
        return {
            "api_key": api_key,
-            "api_base": api_base,
+            "base_url": base_url,
            "api_version": api_version,
            "api_type": api_type,
        }
--- a/src/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/src/llama_stack/providers/remote/inference/cerebras/cerebras.py
@ -4,8 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from urllib.parse import urljoin
-
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 from llama_stack_api import (
    OpenAIEmbeddingsRequestWithExtraBody,
@ -21,7 +19,7 @@ class CerebrasInferenceAdapter(OpenAIMixin):
    provider_data_api_key_field: str = "cerebras_api_key"

    def get_base_url(self) -> str:
-        return urljoin(self.config.base_url, "v1")
+        return str(self.config.base_url)

    async def openai_embeddings(
        self,
--- a/src/llama_stack/providers/remote/inference/cerebras/config.py
+++ b/src/llama_stack/providers/remote/inference/cerebras/config.py
@ -7,12 +7,12 @@
 import os
 from typing import Any

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type

-DEFAULT_BASE_URL = "https://api.cerebras.ai"
+DEFAULT_BASE_URL = "https://api.cerebras.ai/v1"


 class CerebrasProviderDataValidator(BaseModel):
@ -24,8 +24,8 @@ class CerebrasProviderDataValidator(BaseModel):

@json_schema_type
 class CerebrasImplConfig(RemoteInferenceProviderConfig):
-    base_url: str = Field(
-        default=os.environ.get("CEREBRAS_BASE_URL", DEFAULT_BASE_URL),
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl(os.environ.get("CEREBRAS_BASE_URL", DEFAULT_BASE_URL)),
        description="Base URL for the Cerebras API",
    )

--- a/src/llama_stack/providers/remote/inference/databricks/config.py
+++ b/src/llama_stack/providers/remote/inference/databricks/config.py
@ -6,7 +6,7 @@

 from typing import Any

-from pydantic import BaseModel, Field, SecretStr
+from pydantic import BaseModel, Field, HttpUrl, SecretStr

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@ -21,9 +21,9 @@ class DatabricksProviderDataValidator(BaseModel):

@json_schema_type
 class DatabricksImplConfig(RemoteInferenceProviderConfig):
-    url: str | None = Field(
+    base_url: HttpUrl | None = Field(
        default=None,
-        description="The URL for the Databricks model serving endpoint",
+        description="The URL for the Databricks model serving endpoint (should include /serving-endpoints path)",
    )
    auth_credential: SecretStr | None = Field(
        default=None,
@ -34,11 +34,11 @@ class DatabricksImplConfig(RemoteInferenceProviderConfig):
    @classmethod
    def sample_run_config(
        cls,
-        url: str = "${env.DATABRICKS_HOST:=}",
+        base_url: str = "${env.DATABRICKS_HOST:=}",
        api_token: str = "${env.DATABRICKS_TOKEN:=}",
        **kwargs: Any,
    ) -> dict[str, Any]:
        return {
-            "url": url,
+            "base_url": base_url,
            "api_token": api_token,
        }
--- a/src/llama_stack/providers/remote/inference/databricks/databricks.py
+++ b/src/llama_stack/providers/remote/inference/databricks/databricks.py
@ -29,15 +29,21 @@ class DatabricksInferenceAdapter(OpenAIMixin):
    }

    def get_base_url(self) -> str:
-        return f"{self.config.url}/serving-endpoints"
+        return str(self.config.base_url)

    async def list_provider_model_ids(self) -> Iterable[str]:
        # Filter out None values from endpoint names
        api_token = self._get_api_key_from_config_or_provider_data()
+        # WorkspaceClient expects base host without /serving-endpoints suffix
+        base_url_str = str(self.config.base_url)
+        if base_url_str.endswith("/serving-endpoints"):
+            host = base_url_str[:-18]  # Remove '/serving-endpoints'
+        else:
+            host = base_url_str
        return [
            endpoint.name  # type: ignore[misc]
            for endpoint in WorkspaceClient(
-                host=self.config.url, token=api_token
+                host=host, token=api_token
            ).serving_endpoints.list()  # TODO: this is not async
        ]

--- a/src/llama_stack/providers/remote/inference/fireworks/config.py
+++ b/src/llama_stack/providers/remote/inference/fireworks/config.py
@ -6,7 +6,7 @@

 from typing import Any

-from pydantic import Field
+from pydantic import Field, HttpUrl

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@ -14,14 +14,14 @@ from llama_stack_api import json_schema_type

@json_schema_type
 class FireworksImplConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
-        default="https://api.fireworks.ai/inference/v1",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.fireworks.ai/inference/v1"),
        description="The URL for the Fireworks server",
    )

    @classmethod
    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
-            "url": "https://api.fireworks.ai/inference/v1",
+            "base_url": "https://api.fireworks.ai/inference/v1",
            "api_key": api_key,
        }
--- a/src/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/src/llama_stack/providers/remote/inference/fireworks/fireworks.py
@ -24,4 +24,4 @@ class FireworksInferenceAdapter(OpenAIMixin):
    provider_data_api_key_field: str = "fireworks_api_key"

    def get_base_url(self) -> str:
-        return "https://api.fireworks.ai/inference/v1"
+        return str(self.config.base_url)
--- a/src/llama_stack/providers/remote/inference/groq/config.py
+++ b/src/llama_stack/providers/remote/inference/groq/config.py
@ -6,7 +6,7 @@

 from typing import Any

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@ -21,14 +21,14 @@ class GroqProviderDataValidator(BaseModel):

@json_schema_type
 class GroqConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
-        default="https://api.groq.com",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.groq.com/openai/v1"),
        description="The URL for the Groq AI server",
    )

    @classmethod
    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
-            "url": "https://api.groq.com",
+            "base_url": "https://api.groq.com/openai/v1",
            "api_key": api_key,
        }
--- a/src/llama_stack/providers/remote/inference/groq/groq.py
+++ b/src/llama_stack/providers/remote/inference/groq/groq.py
@ -15,4 +15,4 @@ class GroqInferenceAdapter(OpenAIMixin):
    provider_data_api_key_field: str = "groq_api_key"

    def get_base_url(self) -> str:
-        return f"{self.config.url}/openai/v1"
+        return str(self.config.base_url)
--- a/src/llama_stack/providers/remote/inference/llama_openai_compat/config.py
+++ b/src/llama_stack/providers/remote/inference/llama_openai_compat/config.py
@ -6,7 +6,7 @@

 from typing import Any

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@ -21,14 +21,14 @@ class LlamaProviderDataValidator(BaseModel):

@json_schema_type
 class LlamaCompatConfig(RemoteInferenceProviderConfig):
-    openai_compat_api_base: str = Field(
-        default="https://api.llama.com/compat/v1/",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.llama.com/compat/v1/"),
        description="The URL for the Llama API server",
    )

    @classmethod
    def sample_run_config(cls, api_key: str = "${env.LLAMA_API_KEY}", **kwargs) -> dict[str, Any]:
        return {
-            "openai_compat_api_base": "https://api.llama.com/compat/v1/",
+            "base_url": "https://api.llama.com/compat/v1/",
            "api_key": api_key,
        }
--- a/src/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
+++ b/src/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
@ -31,7 +31,7 @@ class LlamaCompatInferenceAdapter(OpenAIMixin):

        :return: The Llama API base URL
        """
-        return self.config.openai_compat_api_base
+        return str(self.config.base_url)

    async def openai_completion(
        self,
--- a/src/llama_stack/providers/remote/inference/nvidia/config.py
+++ b/src/llama_stack/providers/remote/inference/nvidia/config.py
@ -7,7 +7,7 @@
 import os
 from typing import Any

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@ -44,18 +44,14 @@ class NVIDIAConfig(RemoteInferenceProviderConfig):
    URL of your running NVIDIA NIM and do not need to set the api_key.
    """

-    url: str = Field(
-        default_factory=lambda: os.getenv("NVIDIA_BASE_URL", "https://integrate.api.nvidia.com"),
+    base_url: HttpUrl | None = Field(
+        default_factory=lambda: os.getenv("NVIDIA_BASE_URL", "https://integrate.api.nvidia.com/v1"),
        description="A base url for accessing the NVIDIA NIM",
    )
    timeout: int = Field(
        default=60,
        description="Timeout for the HTTP requests",
    )
-    append_api_version: bool = Field(
-        default_factory=lambda: os.getenv("NVIDIA_APPEND_API_VERSION", "True").lower() != "false",
-        description="When set to false, the API version will not be appended to the base_url. By default, it is true.",
-    )
    rerank_model_to_url: dict[str, str] = Field(
        default_factory=lambda: {
            "nv-rerank-qa-mistral-4b:1": "https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking",
@ -68,13 +64,11 @@ class NVIDIAConfig(RemoteInferenceProviderConfig):
    @classmethod
    def sample_run_config(
        cls,
-        url: str = "${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}",
+        base_url: HttpUrl | None = "${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com/v1}",
        api_key: str = "${env.NVIDIA_API_KEY:=}",
-        append_api_version: bool = "${env.NVIDIA_APPEND_API_VERSION:=True}",
        **kwargs,
    ) -> dict[str, Any]:
        return {
-            "url": url,
+            "base_url": base_url,
            "api_key": api_key,
-            "append_api_version": append_api_version,
        }
--- a/src/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/src/llama_stack/providers/remote/inference/nvidia/nvidia.py
@ -44,7 +44,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin):
    }

    async def initialize(self) -> None:
-        logger.info(f"Initializing NVIDIAInferenceAdapter({self.config.url})...")
+        logger.info(f"Initializing NVIDIAInferenceAdapter({self.config.base_url})...")

        if _is_nvidia_hosted(self.config):
            if not self.config.auth_credential:
@ -72,7 +72,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin):

        :return: The NVIDIA API base URL
        """
-        return f"{self.config.url}/v1" if self.config.append_api_version else self.config.url
+        return str(self.config.base_url)

    async def list_provider_model_ids(self) -> Iterable[str]:
        """
--- a/src/llama_stack/providers/remote/inference/nvidia/utils.py
+++ b/src/llama_stack/providers/remote/inference/nvidia/utils.py
@ -8,4 +8,4 @@ from . import NVIDIAConfig


 def _is_nvidia_hosted(config: NVIDIAConfig) -> bool:
-    return "integrate.api.nvidia.com" in config.url
+    return "integrate.api.nvidia.com" in str(config.base_url)
--- a/src/llama_stack/providers/remote/inference/ollama/config.py
+++ b/src/llama_stack/providers/remote/inference/ollama/config.py
@ -6,20 +6,22 @@

 from typing import Any

-from pydantic import Field, SecretStr
+from pydantic import Field, HttpUrl, SecretStr

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig

-DEFAULT_OLLAMA_URL = "http://localhost:11434"
+DEFAULT_OLLAMA_URL = "http://localhost:11434/v1"


 class OllamaImplConfig(RemoteInferenceProviderConfig):
    auth_credential: SecretStr | None = Field(default=None, exclude=True)

-    url: str = DEFAULT_OLLAMA_URL
+    base_url: HttpUrl | None = Field(default=HttpUrl(DEFAULT_OLLAMA_URL))

    @classmethod
-    def sample_run_config(cls, url: str = "${env.OLLAMA_URL:=http://localhost:11434}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(
+        cls, base_url: str = "${env.OLLAMA_URL:=http://localhost:11434/v1}", **kwargs
+    ) -> dict[str, Any]:
        return {
-            "url": url,
+            "base_url": base_url,
        }
--- a/src/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/src/llama_stack/providers/remote/inference/ollama/ollama.py
@ -55,17 +55,23 @@ class OllamaInferenceAdapter(OpenAIMixin):
        # ollama client attaches itself to the current event loop (sadly?)
        loop = asyncio.get_running_loop()
        if loop not in self._clients:
-            self._clients[loop] = AsyncOllamaClient(host=self.config.url)
+            # Ollama client expects base URL without /v1 suffix
+            base_url_str = str(self.config.base_url)
+            if base_url_str.endswith("/v1"):
+                host = base_url_str[:-3]
+            else:
+                host = base_url_str
+            self._clients[loop] = AsyncOllamaClient(host=host)
        return self._clients[loop]

    def get_api_key(self):
        return "NO KEY REQUIRED"

    def get_base_url(self):
-        return self.config.url.rstrip("/") + "/v1"
+        return str(self.config.base_url)

    async def initialize(self) -> None:
-        logger.info(f"checking connectivity to Ollama at `{self.config.url}`...")
+        logger.info(f"checking connectivity to Ollama at `{self.config.base_url}`...")
        r = await self.health()
        if r["status"] == HealthStatus.ERROR:
            logger.warning(
--- a/src/llama_stack/providers/remote/inference/openai/config.py
+++ b/src/llama_stack/providers/remote/inference/openai/config.py
@ -6,7 +6,7 @@

 from typing import Any

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@ -21,8 +21,8 @@ class OpenAIProviderDataValidator(BaseModel):

@json_schema_type
 class OpenAIConfig(RemoteInferenceProviderConfig):
-    base_url: str = Field(
-        default="https://api.openai.com/v1",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.openai.com/v1"),
        description="Base URL for OpenAI API",
    )

--- a/src/llama_stack/providers/remote/inference/openai/openai.py
+++ b/src/llama_stack/providers/remote/inference/openai/openai.py
@ -35,4 +35,4 @@ class OpenAIInferenceAdapter(OpenAIMixin):

        Returns the OpenAI API base URL from the configuration.
        """
-        return self.config.base_url
+        return str(self.config.base_url)
--- a/src/llama_stack/providers/remote/inference/passthrough/config.py
+++ b/src/llama_stack/providers/remote/inference/passthrough/config.py
@ -6,7 +6,7 @@

 from typing import Any

-from pydantic import Field
+from pydantic import Field, HttpUrl

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@ -14,16 +14,16 @@ from llama_stack_api import json_schema_type

@json_schema_type
 class PassthroughImplConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
+    base_url: HttpUrl | None = Field(
        default=None,
        description="The URL for the passthrough endpoint",
    )

    @classmethod
    def sample_run_config(
-        cls, url: str = "${env.PASSTHROUGH_URL}", api_key: str = "${env.PASSTHROUGH_API_KEY}", **kwargs
+        cls, base_url: HttpUrl | None = "${env.PASSTHROUGH_URL}", api_key: str = "${env.PASSTHROUGH_API_KEY}", **kwargs
    ) -> dict[str, Any]:
        return {
-            "url": url,
+            "base_url": base_url,
            "api_key": api_key,
        }
--- a/src/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/src/llama_stack/providers/remote/inference/passthrough/passthrough.py
@ -82,8 +82,8 @@ class PassthroughInferenceAdapter(NeedsRequestProviderData, Inference):

    def _get_passthrough_url(self) -> str:
        """Get the passthrough URL from config or provider data."""
-        if self.config.url is not None:
-            return self.config.url
+        if self.config.base_url is not None:
+            return str(self.config.base_url)

        provider_data = self.get_request_provider_data()
        if provider_data is None:
--- a/src/llama_stack/providers/remote/inference/runpod/config.py
+++ b/src/llama_stack/providers/remote/inference/runpod/config.py
@ -6,7 +6,7 @@

 from typing import Any

-from pydantic import BaseModel, Field, SecretStr
+from pydantic import BaseModel, Field, HttpUrl, SecretStr

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@ -21,7 +21,7 @@ class RunpodProviderDataValidator(BaseModel):

@json_schema_type
 class RunpodImplConfig(RemoteInferenceProviderConfig):
-    url: str | None = Field(
+    base_url: HttpUrl | None = Field(
        default=None,
        description="The URL for the Runpod model serving endpoint",
    )
@ -34,6 +34,6 @@ class RunpodImplConfig(RemoteInferenceProviderConfig):
    @classmethod
    def sample_run_config(cls, **kwargs: Any) -> dict[str, Any]:
        return {
-            "url": "${env.RUNPOD_URL:=}",
+            "base_url": "${env.RUNPOD_URL:=}",
            "api_token": "${env.RUNPOD_API_TOKEN}",
        }
--- a/src/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/src/llama_stack/providers/remote/inference/runpod/runpod.py
@ -28,7 +28,7 @@ class RunpodInferenceAdapter(OpenAIMixin):

    def get_base_url(self) -> str:
        """Get base URL for OpenAI client."""
-        return self.config.url
+        return str(self.config.base_url)

    async def openai_chat_completion(
        self,
--- a/src/llama_stack/providers/remote/inference/sambanova/config.py
+++ b/src/llama_stack/providers/remote/inference/sambanova/config.py
@ -6,7 +6,7 @@

 from typing import Any

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@ -21,14 +21,14 @@ class SambaNovaProviderDataValidator(BaseModel):

@json_schema_type
 class SambaNovaImplConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
-        default="https://api.sambanova.ai/v1",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.sambanova.ai/v1"),
        description="The URL for the SambaNova AI server",
    )

    @classmethod
    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
-            "url": "https://api.sambanova.ai/v1",
+            "base_url": "https://api.sambanova.ai/v1",
            "api_key": api_key,
        }
--- a/src/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/src/llama_stack/providers/remote/inference/sambanova/sambanova.py
@ -25,4 +25,4 @@ class SambaNovaInferenceAdapter(OpenAIMixin):

        :return: The SambaNova base URL
        """
-        return self.config.url
+        return str(self.config.base_url)
--- a/src/llama_stack/providers/remote/inference/tgi/config.py
+++ b/src/llama_stack/providers/remote/inference/tgi/config.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.


-from pydantic import BaseModel, Field, SecretStr
+from pydantic import BaseModel, Field, HttpUrl, SecretStr

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@ -15,18 +15,19 @@ from llama_stack_api import json_schema_type
 class TGIImplConfig(RemoteInferenceProviderConfig):
    auth_credential: SecretStr | None = Field(default=None, exclude=True)

-    url: str = Field(
-        description="The URL for the TGI serving endpoint",
+    base_url: HttpUrl | None = Field(
+        default=None,
+        description="The URL for the TGI serving endpoint (should include /v1 path)",
    )

    @classmethod
    def sample_run_config(
        cls,
-        url: str = "${env.TGI_URL:=}",
+        base_url: str = "${env.TGI_URL:=}",
        **kwargs,
    ):
        return {
-            "url": url,
+            "base_url": base_url,
        }


--- a/src/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/src/llama_stack/providers/remote/inference/tgi/tgi.py
@ -8,7 +8,7 @@
 from collections.abc import Iterable

 from huggingface_hub import AsyncInferenceClient, HfApi
-from pydantic import SecretStr
+from pydantic import HttpUrl, SecretStr

 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
@ -23,7 +23,7 @@ log = get_logger(name=__name__, category="inference::tgi")


 class _HfAdapter(OpenAIMixin):
-    url: str
+    base_url: HttpUrl
    api_key: SecretStr

    hf_client: AsyncInferenceClient
@ -36,7 +36,7 @@ class _HfAdapter(OpenAIMixin):
        return "NO KEY REQUIRED"

    def get_base_url(self):
-        return self.url
+        return self.base_url

    async def list_provider_model_ids(self) -> Iterable[str]:
        return [self.model_id]
@ -50,14 +50,20 @@ class _HfAdapter(OpenAIMixin):

 class TGIAdapter(_HfAdapter):
    async def initialize(self, config: TGIImplConfig) -> None:
-        if not config.url:
+        if not config.base_url:
            raise ValueError("You must provide a URL in run.yaml (or via the TGI_URL environment variable) to use TGI.")
-        log.info(f"Initializing TGI client with url={config.url}")
-        self.hf_client = AsyncInferenceClient(model=config.url, provider="hf-inference")
+        log.info(f"Initializing TGI client with url={config.base_url}")
+        # Extract base URL without /v1 for HF client initialization
+        base_url_str = str(config.base_url).rstrip("/")
+        if base_url_str.endswith("/v1"):
+            base_url_for_client = base_url_str[:-3]
+        else:
+            base_url_for_client = base_url_str
+        self.hf_client = AsyncInferenceClient(model=base_url_for_client, provider="hf-inference")
        endpoint_info = await self.hf_client.get_endpoint_info()
        self.max_tokens = endpoint_info["max_total_tokens"]
        self.model_id = endpoint_info["model_id"]
-        self.url = f"{config.url.rstrip('/')}/v1"
+        self.base_url = config.base_url
        self.api_key = SecretStr("NO_KEY")


--- a/src/llama_stack/providers/remote/inference/together/config.py
+++ b/src/llama_stack/providers/remote/inference/together/config.py
@ -6,7 +6,7 @@

 from typing import Any

-from pydantic import Field
+from pydantic import Field, HttpUrl

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@ -14,14 +14,14 @@ from llama_stack_api import json_schema_type

@json_schema_type
 class TogetherImplConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
-        default="https://api.together.xyz/v1",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.together.xyz/v1"),
        description="The URL for the Together AI server",
    )

    @classmethod
    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
        return {
-            "url": "https://api.together.xyz/v1",
+            "base_url": "https://api.together.xyz/v1",
            "api_key": "${env.TOGETHER_API_KEY:=}",
        }
--- a/src/llama_stack/providers/remote/inference/together/together.py
+++ b/src/llama_stack/providers/remote/inference/together/together.py
@ -9,7 +9,6 @@ from collections.abc import Iterable
 from typing import Any, cast

 from together import AsyncTogether  # type: ignore[import-untyped]
-from together.constants import BASE_URL  # type: ignore[import-untyped]

 from llama_stack.core.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
@ -42,7 +41,7 @@ class TogetherInferenceAdapter(OpenAIMixin, NeedsRequestProviderData):
    provider_data_api_key_field: str = "together_api_key"

    def get_base_url(self):
-        return BASE_URL
+        return str(self.config.base_url)

    def _get_client(self) -> AsyncTogether:
        together_api_key = None
--- a/src/llama_stack/providers/remote/inference/vllm/config.py
+++ b/src/llama_stack/providers/remote/inference/vllm/config.py
@ -6,7 +6,7 @@

 from pathlib import Path

-from pydantic import Field, SecretStr, field_validator
+from pydantic import Field, HttpUrl, SecretStr, field_validator

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@ -14,7 +14,7 @@ from llama_stack_api import json_schema_type

@json_schema_type
 class VLLMInferenceAdapterConfig(RemoteInferenceProviderConfig):
-    url: str | None = Field(
+    base_url: HttpUrl | None = Field(
        default=None,
        description="The URL for the vLLM model serving endpoint",
    )
@ -48,11 +48,11 @@ class VLLMInferenceAdapterConfig(RemoteInferenceProviderConfig):
    @classmethod
    def sample_run_config(
        cls,
-        url: str = "${env.VLLM_URL:=}",
+        base_url: str = "${env.VLLM_URL:=}",
        **kwargs,
    ):
        return {
-            "url": url,
+            "base_url": base_url,
            "max_tokens": "${env.VLLM_MAX_TOKENS:=4096}",
            "api_token": "${env.VLLM_API_TOKEN:=fake}",
            "tls_verify": "${env.VLLM_TLS_VERIFY:=true}",
--- a/src/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/src/llama_stack/providers/remote/inference/vllm/vllm.py
@ -39,12 +39,12 @@ class VLLMInferenceAdapter(OpenAIMixin):

    def get_base_url(self) -> str:
        """Get the base URL from config."""
-        if not self.config.url:
+        if not self.config.base_url:
            raise ValueError("No base URL configured")
-        return self.config.url
+        return str(self.config.base_url)

    async def initialize(self) -> None:
-        if not self.config.url:
+        if not self.config.base_url:
            raise ValueError(
                "You must provide a URL in run.yaml (or via the VLLM_URL environment variable) to use vLLM."
            )
--- a/src/llama_stack/providers/remote/inference/watsonx/config.py
+++ b/src/llama_stack/providers/remote/inference/watsonx/config.py
@ -7,7 +7,7 @@
 import os
 from typing import Any

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl

 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack_api import json_schema_type
@ -23,7 +23,7 @@ class WatsonXProviderDataValidator(BaseModel):

@json_schema_type
 class WatsonXConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
+    base_url: HttpUrl | None = Field(
        default_factory=lambda: os.getenv("WATSONX_BASE_URL", "https://us-south.ml.cloud.ibm.com"),
        description="A base url for accessing the watsonx.ai",
    )
@ -39,7 +39,7 @@ class WatsonXConfig(RemoteInferenceProviderConfig):
    @classmethod
    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
        return {
-            "url": "${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}",
+            "base_url": "${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}",
            "api_key": "${env.WATSONX_API_KEY:=}",
            "project_id": "${env.WATSONX_PROJECT_ID:=}",
        }
--- a/src/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/src/llama_stack/providers/remote/inference/watsonx/watsonx.py
@ -255,7 +255,7 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
        )

    def get_base_url(self) -> str:
-        return self.config.url
+        return str(self.config.base_url)

    # Copied from OpenAIMixin
    async def check_model_availability(self, model: str) -> bool:
@ -316,7 +316,7 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
        """
        Retrieves foundation model specifications from the watsonx.ai API.
        """
-        url = f"{self.config.url}/ml/v1/foundation_model_specs?version=2023-10-25"
+        url = f"{str(self.config.base_url)}/ml/v1/foundation_model_specs?version=2023-10-25"
        headers = {
            # Note that there is no authorization header.  Listing models does not require authentication.
            "Content-Type": "application/json",
--- a/tests/integration/README.md
+++ b/tests/integration/README.md
@ -211,3 +211,23 @@ def test_asymmetric_embeddings(llama_stack_client, embedding_model_id):

    assert query_response.embeddings is not None
 ```
+
+## TypeScript Client Replays
+
+TypeScript SDK tests can run alongside Python tests when testing against `server:<config>` stacks. Set `TS_CLIENT_PATH` to the path or version of `llama-stack-client-typescript` to enable:
+
+```bash
+# Use published npm package (responses suite)
+TS_CLIENT_PATH=^0.3.2 scripts/integration-tests.sh --stack-config server:ci-tests --suite responses --setup gpt
+
+# Use local checkout from ~/.cache (recommended for development)
+git clone https://github.com/llamastack/llama-stack-client-typescript.git ~/.cache/llama-stack-client-typescript
+TS_CLIENT_PATH=~/.cache/llama-stack-client-typescript scripts/integration-tests.sh --stack-config server:ci-tests --suite responses --setup gpt
+
+# Run base suite with TypeScript tests
+TS_CLIENT_PATH=~/.cache/llama-stack-client-typescript scripts/integration-tests.sh --stack-config server:ci-tests --suite base --setup ollama
+```
+
+TypeScript tests run immediately after Python tests pass, using the same replay fixtures. The mapping between Python suites/setups and TypeScript test files is defined in `tests/integration/client-typescript/suites.json`.
+
+If `TS_CLIENT_PATH` is unset, TypeScript tests are skipped entirely.
--- a/tests/integration/client-typescript/tests/inference.test.ts
+++ b/tests/integration/client-typescript/tests/inference.test.ts
@ -0,0 +1,104 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the terms described in the LICENSE file in
+// the root directory of this source tree.
+
+/**
+ * Integration tests for Inference API (Chat Completions).
+ * Ported from: llama-stack/tests/integration/inference/test_openai_completion.py
+ *
+ * IMPORTANT: Test cases must match EXACTLY with Python tests to use recorded API responses.
+ */
+
+import { createTestClient, requireTextModel } from '../setup';
+
+describe('Inference API - Chat Completions', () => {
+  // Test cases matching llama-stack/tests/integration/test_cases/inference/chat_completion.json
+  const chatCompletionTestCases = [
+    {
+      id: 'non_streaming_01',
+      question: 'Which planet do humans live on?',
+      expected: 'earth',
+      testId:
+        'tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-inference:chat_completion:non_streaming_01]',
+    },
+    {
+      id: 'non_streaming_02',
+      question: 'Which planet has rings around it with a name starting with letter S?',
+      expected: 'saturn',
+      testId:
+        'tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-inference:chat_completion:non_streaming_02]',
+    },
+  ];
+
+  const streamingTestCases = [
+    {
+      id: 'streaming_01',
+      question: "What's the name of the Sun in latin?",
+      expected: 'sol',
+      testId:
+        'tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-inference:chat_completion:streaming_01]',
+    },
+    {
+      id: 'streaming_02',
+      question: 'What is the name of the US captial?',
+      expected: 'washington',
+      testId:
+        'tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-inference:chat_completion:streaming_02]',
+    },
+  ];
+
+  test.each(chatCompletionTestCases)(
+    'chat completion non-streaming: $id',
+    async ({ question, expected, testId }) => {
+      const client = createTestClient(testId);
+      const textModel = requireTextModel();
+
+      const response = await client.chat.completions.create({
+        model: textModel,
+        messages: [
+          {
+            role: 'user',
+            content: question,
+          },
+        ],
+        stream: false,
+      });
+
+      // Non-streaming responses have choices with message property
+      const choice = response.choices[0];
+      expect(choice).toBeDefined();
+      if (!choice || !('message' in choice)) {
+        throw new Error('Expected non-streaming response with message');
+      }
+      const content = choice.message.content;
+      expect(content).toBeDefined();
+      const messageContent = typeof content === 'string' ? content.toLowerCase().trim() : '';
+      expect(messageContent.length).toBeGreaterThan(0);
+      expect(messageContent).toContain(expected.toLowerCase());
+    },
+  );
+
+  test.each(streamingTestCases)('chat completion streaming: $id', async ({ question, expected, testId }) => {
+    const client = createTestClient(testId);
+    const textModel = requireTextModel();
+
+    const stream = await client.chat.completions.create({
+      model: textModel,
+      messages: [{ role: 'user', content: question }],
+      stream: true,
+    });
+
+    const streamedContent: string[] = [];
+    for await (const chunk of stream) {
+      if (chunk.choices && chunk.choices.length > 0 && chunk.choices[0]?.delta?.content) {
+        streamedContent.push(chunk.choices[0].delta.content);
+      }
+    }
+
+    expect(streamedContent.length).toBeGreaterThan(0);
+    const fullContent = streamedContent.join('').toLowerCase().trim();
+    expect(fullContent).toContain(expected.toLowerCase());
+  });
+});
--- a/tests/integration/client-typescript/tests/responses.test.ts
+++ b/tests/integration/client-typescript/tests/responses.test.ts
@ -0,0 +1,132 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the terms described in the LICENSE file in
+// the root directory of this source tree.
+
+/**
+ * Integration tests for Responses API.
+ * Ported from: llama-stack/tests/integration/responses/test_basic_responses.py
+ *
+ * IMPORTANT: Test cases and IDs must match EXACTLY with Python tests to use recorded API responses.
+ */
+
+import { createTestClient, requireTextModel, getResponseOutputText } from '../setup';
+
+describe('Responses API - Basic', () => {
+  // Test cases matching llama-stack/tests/integration/responses/fixtures/test_cases.py
+  const basicTestCases = [
+    {
+      id: 'earth',
+      input: 'Which planet do humans live on?',
+      expected: 'earth',
+      // Use client_with_models fixture to match non-streaming recordings
+      testId:
+        'tests/integration/responses/test_basic_responses.py::test_response_non_streaming_basic[client_with_models-txt=openai/gpt-4o-earth]',
+    },
+    {
+      id: 'saturn',
+      input: 'Which planet has rings around it with a name starting with letter S?',
+      expected: 'saturn',
+      testId:
+        'tests/integration/responses/test_basic_responses.py::test_response_non_streaming_basic[client_with_models-txt=openai/gpt-4o-saturn]',
+    },
+  ];
+
+  test.each(basicTestCases)('non-streaming basic response: $id', async ({ input, expected, testId }) => {
+    // Create client with test_id for all requests
+    const client = createTestClient(testId);
+    const textModel = requireTextModel();
+
+    // Create a response
+    const response = await client.responses.create({
+      model: textModel,
+      input,
+      stream: false,
+    });
+
+    // Verify response has content
+    const outputText = getResponseOutputText(response).toLowerCase().trim();
+    expect(outputText.length).toBeGreaterThan(0);
+    expect(outputText).toContain(expected.toLowerCase());
+
+    // Verify usage is reported
+    expect(response.usage).toBeDefined();
+    expect(response.usage!.input_tokens).toBeGreaterThan(0);
+    expect(response.usage!.output_tokens).toBeGreaterThan(0);
+    expect(response.usage!.total_tokens).toBe(response.usage!.input_tokens + response.usage!.output_tokens);
+
+    // Verify stored response matches
+    const retrievedResponse = await client.responses.retrieve(response.id);
+    expect(getResponseOutputText(retrievedResponse)).toBe(getResponseOutputText(response));
+
+    // Test follow-up with previous_response_id
+    const nextResponse = await client.responses.create({
+      model: textModel,
+      input: 'Repeat your previous response in all caps.',
+      previous_response_id: response.id,
+    });
+    const nextOutputText = getResponseOutputText(nextResponse).trim();
+    expect(nextOutputText).toContain(expected.toUpperCase());
+  });
+
+  test.each(basicTestCases)('streaming basic response: $id', async ({ input, expected, testId }) => {
+    // Modify test_id for streaming variant
+    const streamingTestId = testId.replace(
+      'test_response_non_streaming_basic',
+      'test_response_streaming_basic',
+    );
+    const client = createTestClient(streamingTestId);
+    const textModel = requireTextModel();
+
+    // Create a streaming response
+    const stream = await client.responses.create({
+      model: textModel,
+      input,
+      stream: true,
+    });
+
+    const events: any[] = [];
+    let responseId = '';
+
+    for await (const chunk of stream) {
+      events.push(chunk);
+
+      if (chunk.type === 'response.created') {
+        // Verify response.created is the first event
+        expect(events.length).toBe(1);
+        expect(chunk.response.status).toBe('in_progress');
+        responseId = chunk.response.id;
+      } else if (chunk.type === 'response.completed') {
+        // Verify response.completed comes after response.created
+        expect(events.length).toBeGreaterThanOrEqual(2);
+        expect(chunk.response.status).toBe('completed');
+        expect(chunk.response.id).toBe(responseId);
+
+        // Verify content quality
+        const outputText = getResponseOutputText(chunk.response).toLowerCase().trim();
+        expect(outputText.length).toBeGreaterThan(0);
+        expect(outputText).toContain(expected.toLowerCase());
+
+        // Verify usage is reported
+        expect(chunk.response.usage).toBeDefined();
+        expect(chunk.response.usage!.input_tokens).toBeGreaterThan(0);
+        expect(chunk.response.usage!.output_tokens).toBeGreaterThan(0);
+        expect(chunk.response.usage!.total_tokens).toBe(
+          chunk.response.usage!.input_tokens + chunk.response.usage!.output_tokens,
+        );
+      }
+    }
+
+    // Verify we got both events
+    expect(events.length).toBeGreaterThanOrEqual(2);
+    const firstEvent = events[0];
+    const lastEvent = events[events.length - 1];
+    expect(firstEvent.type).toBe('response.created');
+    expect(lastEvent.type).toBe('response.completed');
+
+    // Verify stored response matches streamed response
+    const retrievedResponse = await client.responses.retrieve(responseId);
+    expect(getResponseOutputText(retrievedResponse)).toBe(getResponseOutputText(lastEvent.response));
+  });
+});
--- a/tests/integration/client-typescript/jest.integration.config.js
+++ b/tests/integration/client-typescript/jest.integration.config.js
@ -0,0 +1,31 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the terms described in the LICENSE file in
+// the root directory of this source tree.
+
+/** @type {import('ts-jest').JestConfigWithTsJest} */
+module.exports = {
+  preset: 'ts-jest/presets/default-esm',
+  testEnvironment: 'node',
+  extensionsToTreatAsEsm: ['.ts'],
+  moduleNameMapper: {
+    '^(\\.{1,2}/.*)\\.js$': '$1',
+  },
+  transform: {
+    '^.+\\.tsx?$': [
+      'ts-jest',
+      {
+        useESM: true,
+        tsconfig: {
+          module: 'ES2022',
+          moduleResolution: 'bundler',
+        },
+      },
+    ],
+  },
+  testMatch: ['<rootDir>/__tests__/**/*.test.ts'],
+  setupFilesAfterEnv: ['<rootDir>/setup.ts'],
+  testTimeout: 60000, // 60 seconds (integration tests can be slow)
+  watchman: false, // Disable watchman to avoid permission issues
+};
--- a/tests/integration/client-typescript/package-lock.json
+++ b/tests/integration/client-typescript/package-lock.json
--- a/tests/integration/client-typescript/package.json
+++ b/tests/integration/client-typescript/package.json
@ -0,0 +1,18 @@
+{
+  "name": "llama-stack-typescript-integration-tests",
+  "version": "0.0.1",
+  "private": true,
+  "description": "TypeScript client integration tests for Llama Stack",
+  "scripts": {
+    "test": "node run-tests.js"
+  },
+  "devDependencies": {
+    "@swc/core": "^1.3.102",
+    "@swc/jest": "^0.2.29",
+    "@types/jest": "^29.4.0",
+    "@types/node": "^20.0.0",
+    "jest": "^29.4.0",
+    "ts-jest": "^29.1.0",
+    "typescript": "^5.0.0"
+  }
+}
--- a/tests/integration/client-typescript/run-tests.js
+++ b/tests/integration/client-typescript/run-tests.js
@ -0,0 +1,63 @@
+#!/usr/bin/env node
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the terms described in the LICENSE file in
+// the root directory of this source tree.
+
+/**
+ * Test runner that finds and executes TypeScript tests based on suite/setup mapping.
+ * Called by integration-tests.sh via npm test.
+ */
+
+const fs = require('fs');
+const path = require('path');
+const { execSync } = require('child_process');
+
+const suite = process.env.LLAMA_STACK_TEST_SUITE;
+const setup = process.env.LLAMA_STACK_TEST_SETUP || '';
+
+if (!suite) {
+  console.error('Error: LLAMA_STACK_TEST_SUITE environment variable is required');
+  process.exit(1);
+}
+
+// Read suites.json to find matching test files
+const suitesPath = path.join(__dirname, 'suites.json');
+if (!fs.existsSync(suitesPath)) {
+  console.log(`No TypeScript tests configured (${suitesPath} not found)`);
+  process.exit(0);
+}
+
+const suites = JSON.parse(fs.readFileSync(suitesPath, 'utf-8'));
+
+// Find matching entry
+let testFiles = [];
+for (const entry of suites) {
+  if (entry.suite !== suite) {
+    continue;
+  }
+  const entrySetup = entry.setup || '';
+  if (entrySetup && entrySetup !== setup) {
+    continue;
+  }
+  testFiles = entry.files || [];
+  break;
+}
+
+if (testFiles.length === 0) {
+  console.log(`No TypeScript integration tests mapped for suite ${suite} (setup ${setup})`);
+  process.exit(0);
+}
+
+console.log(`Running TypeScript tests for suite ${suite} (setup ${setup}): ${testFiles.join(', ')}`);
+
+// Run Jest with the mapped test files
+try {
+  execSync(`npx jest --config jest.integration.config.js ${testFiles.join(' ')}`, {
+    stdio: 'inherit',
+    cwd: __dirname,
+  });
+} catch (error) {
+  process.exit(error.status || 1);
+}
--- a/tests/integration/client-typescript/setup.ts
+++ b/tests/integration/client-typescript/setup.ts
@ -0,0 +1,162 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the terms described in the LICENSE file in
+// the root directory of this source tree.
+
+/**
+ * Global setup for integration tests.
+ * This file mimics pytest's fixture system by providing shared test configuration.
+ */
+
+import LlamaStackClient from 'llama-stack-client';
+
+/**
+ * Load test configuration from the Python setup system.
+ * This reads setup definitions from tests/integration/suites.py via get_setup_env.py.
+ */
+function loadTestConfig() {
+  const baseURL = process.env['TEST_API_BASE_URL'];
+  const setupName = process.env['LLAMA_STACK_TEST_SETUP'];
+  const textModel = process.env['LLAMA_STACK_TEST_TEXT_MODEL'];
+  const embeddingModel = process.env['LLAMA_STACK_TEST_EMBEDDING_MODEL'];
+
+  if (!baseURL) {
+    throw new Error(
+      'TEST_API_BASE_URL is required for integration tests. ' +
+        'Run tests using: ./scripts/integration-test.sh',
+    );
+  }
+
+  return {
+    baseURL,
+    textModel,
+    embeddingModel,
+    setupName,
+  };
+}
+
+// Read configuration from environment variables (set by scripts/integration-test.sh)
+export const TEST_CONFIG = loadTestConfig();
+
+// Validate required configuration
+beforeAll(() => {
+  console.log('\n=== Integration Test Configuration ===');
+  console.log(`Base URL: ${TEST_CONFIG.baseURL}`);
+  console.log(`Setup: ${TEST_CONFIG.setupName || 'NOT SET'}`);
+  console.log(
+    `Text Model: ${TEST_CONFIG.textModel || 'NOT SET - tests requiring text model will be skipped'}`,
+  );
+  console.log(
+    `Embedding Model: ${
+      TEST_CONFIG.embeddingModel || 'NOT SET - tests requiring embedding model will be skipped'
+    }`,
+  );
+  console.log('=====================================\n');
+});
+
+/**
+ * Create a client instance for integration tests.
+ * Mimics pytest's `llama_stack_client` fixture.
+ *
+ * @param testId - Test ID to send in X-LlamaStack-Provider-Data header for replay mode.
+ *                 Format: "tests/integration/responses/test_basic_responses.py::test_name[params]"
+ */
+export function createTestClient(testId?: string): LlamaStackClient {
+  const headers: Record<string, string> = {};
+
+  // In server mode with replay, send test ID for recording isolation
+  if (process.env['LLAMA_STACK_TEST_STACK_CONFIG_TYPE'] === 'server' && testId) {
+    headers['X-LlamaStack-Provider-Data'] = JSON.stringify({
+      __test_id: testId,
+    });
+  }
+
+  return new LlamaStackClient({
+    baseURL: TEST_CONFIG.baseURL,
+    timeout: 60000, // 60 seconds
+    defaultHeaders: headers,
+  });
+}
+
+/**
+ * Skip test if required model is not configured.
+ * Mimics pytest's `skip_if_no_model` autouse fixture.
+ */
+export function skipIfNoModel(modelType: 'text' | 'embedding'): typeof test {
+  const model = modelType === 'text' ? TEST_CONFIG.textModel : TEST_CONFIG.embeddingModel;
+
+  if (!model) {
+    const envVar = modelType === 'text' ? 'LLAMA_STACK_TEST_TEXT_MODEL' : 'LLAMA_STACK_TEST_EMBEDDING_MODEL';
+    const message = `Skipping: ${modelType} model not configured (set ${envVar})`;
+    return test.skip.bind(test) as typeof test;
+  }
+
+  return test;
+}
+
+/**
+ * Get the configured text model, throwing if not set.
+ * Use this in tests that absolutely require a text model.
+ */
+export function requireTextModel(): string {
+  if (!TEST_CONFIG.textModel) {
+    throw new Error(
+      'LLAMA_STACK_TEST_TEXT_MODEL environment variable is required. ' +
+        'Run tests using: ./scripts/integration-test.sh',
+    );
+  }
+  return TEST_CONFIG.textModel;
+}
+
+/**
+ * Get the configured embedding model, throwing if not set.
+ * Use this in tests that absolutely require an embedding model.
+ */
+export function requireEmbeddingModel(): string {
+  if (!TEST_CONFIG.embeddingModel) {
+    throw new Error(
+      'LLAMA_STACK_TEST_EMBEDDING_MODEL environment variable is required. ' +
+        'Run tests using: ./scripts/integration-test.sh',
+    );
+  }
+  return TEST_CONFIG.embeddingModel;
+}
+
+/**
+ * Extracts aggregated text output from a ResponseObject.
+ * This concatenates all text content from the response's output array.
+ *
+ * Copied from llama-stack-client's response-helpers until it's available in published version.
+ */
+export function getResponseOutputText(response: any): string {
+  const pieces: string[] = [];
+
+  for (const output of response.output ?? []) {
+    if (!output || output.type !== 'message') {
+      continue;
+    }
+
+    const content = output.content;
+    if (typeof content === 'string') {
+      pieces.push(content);
+      continue;
+    }
+
+    if (!Array.isArray(content)) {
+      continue;
+    }
+
+    for (const item of content) {
+      if (typeof item === 'string') {
+        pieces.push(item);
+        continue;
+      }
+      if (item && item.type === 'output_text' && 'text' in item && typeof item.text === 'string') {
+        pieces.push(item.text);
+      }
+    }
+  }
+
+  return pieces.join('');
+}
--- a/tests/integration/client-typescript/suites.json
+++ b/tests/integration/client-typescript/suites.json
@ -0,0 +1,12 @@
+[
+  {
+    "suite": "responses",
+    "setup": "gpt",
+    "files": ["__tests__/responses.test.ts"]
+  },
+  {
+    "suite": "base",
+    "setup": "ollama",
+    "files": ["__tests__/inference.test.ts"]
+  }
+]
--- a/tests/integration/client-typescript/tsconfig.json
+++ b/tests/integration/client-typescript/tsconfig.json
@ -0,0 +1,16 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "ES2022",
+    "lib": ["ES2022"],
+    "moduleResolution": "bundler",
+    "esModuleInterop": true,
+    "allowSyntheticDefaultImports": true,
+    "strict": true,
+    "skipLibCheck": true,
+    "resolveJsonModule": true,
+    "types": ["jest", "node"]
+  },
+  "include": ["**/*.ts"],
+  "exclude": ["node_modules"]
+}
--- a/tests/integration/suites.py
+++ b/tests/integration/suites.py
@ -50,7 +50,7 @@ SETUP_DEFINITIONS: dict[str, Setup] = {
        name="ollama",
        description="Local Ollama provider with text + safety models",
        env={
-            "OLLAMA_URL": "http://0.0.0.0:11434",
+            "OLLAMA_URL": "http://0.0.0.0:11434/v1",
            "SAFETY_MODEL": "ollama/llama-guard3:1b",
        },
        defaults={
@ -64,7 +64,7 @@ SETUP_DEFINITIONS: dict[str, Setup] = {
        name="ollama",
        description="Local Ollama provider with a vision model",
        env={
-            "OLLAMA_URL": "http://0.0.0.0:11434",
+            "OLLAMA_URL": "http://0.0.0.0:11434/v1",
        },
        defaults={
            "vision_model": "ollama/llama3.2-vision:11b",
@ -75,7 +75,7 @@ SETUP_DEFINITIONS: dict[str, Setup] = {
        name="ollama-postgres",
        description="Server-mode tests with Postgres-backed persistence",
        env={
-            "OLLAMA_URL": "http://0.0.0.0:11434",
+            "OLLAMA_URL": "http://0.0.0.0:11434/v1",
            "SAFETY_MODEL": "ollama/llama-guard3:1b",
            "POSTGRES_HOST": "127.0.0.1",
            "POSTGRES_PORT": "5432",
--- a/tests/unit/providers/agents/meta_reference/test_safety_optional.py
+++ b/tests/unit/providers/agents/meta_reference/test_safety_optional.py
@ -0,0 +1,206 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""Tests for making Safety API optional in meta-reference agents provider.
+
+This test suite validates the changes introduced to fix issue #4165, which
+allows running the meta-reference agents provider without the Safety API.
+Safety API is now an optional dependency, and errors are raised at request time
+when guardrails are explicitly requested without Safety API configured.
+"""
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from llama_stack.core.datatypes import Api
+from llama_stack.core.storage.datatypes import KVStoreReference, ResponsesStoreReference
+from llama_stack.providers.inline.agents.meta_reference import get_provider_impl
+from llama_stack.providers.inline.agents.meta_reference.config import (
+    AgentPersistenceConfig,
+    MetaReferenceAgentsImplConfig,
+)
+from llama_stack.providers.inline.agents.meta_reference.responses.utils import (
+    run_guardrails,
+)
+
+
+@pytest.fixture
+def mock_persistence_config():
+    """Create a mock persistence configuration."""
+    return AgentPersistenceConfig(
+        agent_state=KVStoreReference(
+            backend="kv_default",
+            namespace="agents",
+        ),
+        responses=ResponsesStoreReference(
+            backend="sql_default",
+            table_name="responses",
+        ),
+    )
+
+
+@pytest.fixture
+def mock_deps():
+    """Create mock dependencies for the agents provider."""
+    # Create mock APIs
+    inference_api = AsyncMock()
+    vector_io_api = AsyncMock()
+    tool_runtime_api = AsyncMock()
+    tool_groups_api = AsyncMock()
+    conversations_api = AsyncMock()
+
+    return {
+        Api.inference: inference_api,
+        Api.vector_io: vector_io_api,
+        Api.tool_runtime: tool_runtime_api,
+        Api.tool_groups: tool_groups_api,
+        Api.conversations: conversations_api,
+    }
+
+
+class TestProviderInitialization:
+    """Test provider initialization with different safety API configurations."""
+
+    async def test_initialization_with_safety_api_present(self, mock_persistence_config, mock_deps):
+        """Test successful initialization when Safety API is configured."""
+        config = MetaReferenceAgentsImplConfig(persistence=mock_persistence_config)
+
+        # Add safety API to deps
+        safety_api = AsyncMock()
+        mock_deps[Api.safety] = safety_api
+
+        # Mock the initialize method to avoid actual initialization
+        with patch(
+            "llama_stack.providers.inline.agents.meta_reference.agents.MetaReferenceAgentsImpl.initialize",
+            new_callable=AsyncMock,
+        ):
+            # Should not raise any exception
+            provider = await get_provider_impl(config, mock_deps, policy=[], telemetry_enabled=False)
+            assert provider is not None
+
+    async def test_initialization_without_safety_api(self, mock_persistence_config, mock_deps):
+        """Test successful initialization when Safety API is not configured."""
+        config = MetaReferenceAgentsImplConfig(persistence=mock_persistence_config)
+
+        # Safety API is NOT in mock_deps - provider should still start
+        # Mock the initialize method to avoid actual initialization
+        with patch(
+            "llama_stack.providers.inline.agents.meta_reference.agents.MetaReferenceAgentsImpl.initialize",
+            new_callable=AsyncMock,
+        ):
+            # Should not raise any exception
+            provider = await get_provider_impl(config, mock_deps, policy=[], telemetry_enabled=False)
+            assert provider is not None
+            assert provider.safety_api is None
+
+
+class TestGuardrailsFunctionality:
+    """Test run_guardrails function with optional safety API."""
+
+    async def test_run_guardrails_with_none_safety_api(self):
+        """Test that run_guardrails returns None when safety_api is None."""
+        result = await run_guardrails(safety_api=None, messages="test message", guardrail_ids=["llama-guard"])
+        assert result is None
+
+    async def test_run_guardrails_with_empty_messages(self):
+        """Test that run_guardrails returns None for empty messages."""
+        # Test with None safety API
+        result = await run_guardrails(safety_api=None, messages="", guardrail_ids=["llama-guard"])
+        assert result is None
+
+        # Test with mock safety API
+        mock_safety_api = AsyncMock()
+        result = await run_guardrails(safety_api=mock_safety_api, messages="", guardrail_ids=["llama-guard"])
+        assert result is None
+
+    async def test_run_guardrails_with_none_safety_api_ignores_guardrails(self):
+        """Test that guardrails are skipped when safety_api is None, even if guardrail_ids are provided."""
+        # Should not raise exception, just return None
+        result = await run_guardrails(
+            safety_api=None,
+            messages="potentially harmful content",
+            guardrail_ids=["llama-guard", "content-filter"],
+        )
+        assert result is None
+
+    async def test_create_response_rejects_guardrails_without_safety_api(self, mock_persistence_config, mock_deps):
+        """Test that create_openai_response raises error when guardrails requested but Safety API unavailable."""
+        from llama_stack.providers.inline.agents.meta_reference.responses.openai_responses import (
+            OpenAIResponsesImpl,
+        )
+        from llama_stack_api import ResponseGuardrailSpec
+
+        # Create OpenAIResponsesImpl with no safety API
+        with patch("llama_stack.providers.inline.agents.meta_reference.responses.openai_responses.ResponsesStore"):
+            impl = OpenAIResponsesImpl(
+                inference_api=mock_deps[Api.inference],
+                tool_groups_api=mock_deps[Api.tool_groups],
+                tool_runtime_api=mock_deps[Api.tool_runtime],
+                responses_store=MagicMock(),
+                vector_io_api=mock_deps[Api.vector_io],
+                safety_api=None,  # No Safety API
+                conversations_api=mock_deps[Api.conversations],
+            )
+
+            # Test with string guardrail
+            with pytest.raises(ValueError) as exc_info:
+                await impl.create_openai_response(
+                    input="test input",
+                    model="test-model",
+                    guardrails=["llama-guard"],
+                )
+            assert "Cannot process guardrails: Safety API is not configured" in str(exc_info.value)
+
+            # Test with ResponseGuardrailSpec
+            with pytest.raises(ValueError) as exc_info:
+                await impl.create_openai_response(
+                    input="test input",
+                    model="test-model",
+                    guardrails=[ResponseGuardrailSpec(type="llama-guard")],
+                )
+            assert "Cannot process guardrails: Safety API is not configured" in str(exc_info.value)
+
+    async def test_create_response_succeeds_without_guardrails_and_no_safety_api(
+        self, mock_persistence_config, mock_deps
+    ):
+        """Test that create_openai_response works when no guardrails requested and Safety API unavailable."""
+        from llama_stack.providers.inline.agents.meta_reference.responses.openai_responses import (
+            OpenAIResponsesImpl,
+        )
+
+        # Create OpenAIResponsesImpl with no safety API
+        with (
+            patch("llama_stack.providers.inline.agents.meta_reference.responses.openai_responses.ResponsesStore"),
+            patch.object(OpenAIResponsesImpl, "_create_streaming_response", new_callable=AsyncMock) as mock_stream,
+        ):
+            # Mock the streaming response to return a simple async generator
+            async def mock_generator():
+                yield MagicMock()
+
+            mock_stream.return_value = mock_generator()
+
+            impl = OpenAIResponsesImpl(
+                inference_api=mock_deps[Api.inference],
+                tool_groups_api=mock_deps[Api.tool_groups],
+                tool_runtime_api=mock_deps[Api.tool_runtime],
+                responses_store=MagicMock(),
+                vector_io_api=mock_deps[Api.vector_io],
+                safety_api=None,  # No Safety API
+                conversations_api=mock_deps[Api.conversations],
+            )
+
+            # Should not raise when no guardrails requested
+            # Note: This will still fail later in execution due to mocking, but should pass the validation
+            try:
+                await impl.create_openai_response(
+                    input="test input",
+                    model="test-model",
+                    guardrails=None,  # No guardrails
+                )
+            except Exception as e:
+                # Ensure the error is NOT about missing Safety API
+                assert "Cannot process guardrails: Safety API is not configured" not in str(e)
--- a/tests/unit/providers/inference/test_inference_client_caching.py
+++ b/tests/unit/providers/inference/test_inference_client_caching.py
@ -120,7 +120,7 @@ from llama_stack.providers.remote.inference.watsonx.watsonx import WatsonXInfere
            VLLMInferenceAdapter,
            "llama_stack.providers.remote.inference.vllm.VLLMProviderDataValidator",
            {
-                "url": "http://fake",
+                "base_url": "http://fake",
            },
        ),
    ],
@ -153,7 +153,7 @@ def test_litellm_provider_data_used(config_cls, adapter_cls, provider_data_valid
    """Validate data for LiteLLM-based providers.  Similar to test_openai_provider_data_used, but without the
    assumption that there is an OpenAI-compatible client object."""

-    inference_adapter = adapter_cls(config=config_cls())
+    inference_adapter = adapter_cls(config=config_cls(base_url="http://fake"))

    inference_adapter.__provider_spec__ = MagicMock()
    inference_adapter.__provider_spec__.provider_data_validator = provider_data_validator
--- a/tests/unit/providers/inference/test_remote_vllm.py
+++ b/tests/unit/providers/inference/test_remote_vllm.py
@ -40,7 +40,7 @@ from llama_stack_api import (

@pytest.fixture(scope="function")
 async def vllm_inference_adapter():
-    config = VLLMInferenceAdapterConfig(url="http://mocked.localhost:12345")
+    config = VLLMInferenceAdapterConfig(base_url="http://mocked.localhost:12345")
    inference_adapter = VLLMInferenceAdapter(config=config)
    inference_adapter.model_store = AsyncMock()
    await inference_adapter.initialize()
@ -204,7 +204,7 @@ async def test_vllm_completion_extra_body():
    via extra_body to the underlying OpenAI client through the InferenceRouter.
    """
    # Set up the vLLM adapter
-    config = VLLMInferenceAdapterConfig(url="http://mocked.localhost:12345")
+    config = VLLMInferenceAdapterConfig(base_url="http://mocked.localhost:12345")
    vllm_adapter = VLLMInferenceAdapter(config=config)
    vllm_adapter.__provider_id__ = "vllm"
    await vllm_adapter.initialize()
@ -277,7 +277,7 @@ async def test_vllm_chat_completion_extra_body():
    via extra_body to the underlying OpenAI client through the InferenceRouter for chat completion.
    """
    # Set up the vLLM adapter
-    config = VLLMInferenceAdapterConfig(url="http://mocked.localhost:12345")
+    config = VLLMInferenceAdapterConfig(base_url="http://mocked.localhost:12345")
    vllm_adapter = VLLMInferenceAdapter(config=config)
    vllm_adapter.__provider_id__ = "vllm"
    await vllm_adapter.initialize()
--- a/tests/unit/providers/nvidia/test_rerank_inference.py
+++ b/tests/unit/providers/nvidia/test_rerank_inference.py
@ -146,7 +146,7 @@ async def test_hosted_model_not_in_endpoint_mapping():

 async def test_self_hosted_ignores_endpoint():
    adapter = create_adapter(
-        config=NVIDIAConfig(url="http://localhost:8000", api_key=None),
+        config=NVIDIAConfig(base_url="http://localhost:8000", api_key=None),
        rerank_endpoints={"test-model": "https://model.endpoint/rerank"},  # This should be ignored for self-hosted.
    )
    mock_session = MockSession(MockResponse())
--- a/tests/unit/providers/test_configs.py
+++ b/tests/unit/providers/test_configs.py
@ -4,8 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import get_args, get_origin
+
 import pytest
-from pydantic import BaseModel
+from pydantic import BaseModel, HttpUrl

 from llama_stack.core.distribution import get_provider_registry, providable_apis
 from llama_stack.core.utils.dynamic import instantiate_class_type
@ -41,3 +43,55 @@ class TestProviderConfigurations:

        sample_config = config_type.sample_run_config(__distro_dir__="foobarbaz")
        assert isinstance(sample_config, dict), f"{config_class_name}.sample_run_config() did not return a dict"
+
+    def test_remote_inference_url_standardization(self):
+        """Verify all remote inference providers use standardized base_url configuration."""
+        provider_registry = get_provider_registry()
+        inference_providers = provider_registry.get("inference", {})
+
+        # Filter for remote providers only
+        remote_providers = {k: v for k, v in inference_providers.items() if k.startswith("remote::")}
+
+        failures = []
+        for provider_type, provider_spec in remote_providers.items():
+            try:
+                config_class_name = provider_spec.config_class
+                config_type = instantiate_class_type(config_class_name)
+
+                # Check that config has base_url field (not url)
+                if hasattr(config_type, "model_fields"):
+                    fields = config_type.model_fields
+
+                    # Should NOT have 'url' field (old pattern)
+                    if "url" in fields:
+                        failures.append(
+                            f"{provider_type}: Uses deprecated 'url' field instead of 'base_url'. "
+                            f"Please rename to 'base_url' for consistency."
+                        )
+
+                    # Should have 'base_url' field with HttpUrl | None type
+                    if "base_url" in fields:
+                        field_info = fields["base_url"]
+                        annotation = field_info.annotation
+
+                        # Check if it's HttpUrl or HttpUrl | None
+                        # get_origin() returns Union for (X | Y), None for plain types
+                        # get_args() returns the types inside Union, e.g. (HttpUrl, NoneType)
+                        is_valid = False
+                        if get_origin(annotation) is not None:  # It's a Union/Optional
+                            if HttpUrl in get_args(annotation):
+                                is_valid = True
+                        elif annotation == HttpUrl:  # Plain HttpUrl without | None
+                            is_valid = True
+
+                        if not is_valid:
+                            failures.append(
+                                f"{provider_type}: base_url field has incorrect type annotation. "
+                                f"Expected 'HttpUrl | None', got '{annotation}'"
+                            )
+
+            except Exception as e:
+                failures.append(f"{provider_type}: Error checking URL standardization: {str(e)}")
+
+        if failures:
+            pytest.fail("URL standardization violations found:\n" + "\n".join(f"  - {f}" for f in failures))