Merge remote-tracking branch 'upstream/main' into strip-telem

Charlie Doern 2025-11-06 09:47:12 -05:00
commit d00a085aed
91 changed files with 9321 additions and 544 deletions

View file

@@ -72,7 +72,8 @@ runs:
echo "New recordings detected, committing and pushing"
git add tests/integration/
-git commit -m "Recordings update from CI (suite: ${{ inputs.suite }})"
+git commit -m "Recordings update from CI (setup: ${{ inputs.setup }}, suite: ${{ inputs.suite }})"
git fetch origin ${{ github.ref_name }}
git rebase origin/${{ github.ref_name }}
echo "Rebased successfully"
@@ -88,6 +89,8 @@ runs:
run: |
# Ollama logs (if ollama container exists)
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log 2>&1 || true
+# vllm logs (if vllm container exists)
+sudo docker logs vllm > vllm-${{ inputs.inference-mode }}.log 2>&1 || true
# Note: distro container logs are now dumped in integration-tests.sh before container is removed
- name: Upload logs

View file

@@ -11,13 +11,14 @@ runs:
--name vllm \
-p 8000:8000 \
--privileged=true \
-quay.io/higginsd/vllm-cpu:65393ee064 \
+quay.io/higginsd/vllm-cpu:65393ee064-qwen3 \
--host 0.0.0.0 \
--port 8000 \
--enable-auto-tool-choice \
---tool-call-parser llama3_json \
+--tool-call-parser hermes \
---model /root/.cache/Llama-3.2-1B-Instruct \
+--model /root/.cache/Qwen3-0.6B \
---served-model-name meta-llama/Llama-3.2-1B-Instruct
+--served-model-name Qwen/Qwen3-0.6B \
+--max-model-len 8192
# Wait for vllm to be ready
echo "Waiting for vllm to be ready..."

View file

@@ -27,7 +27,6 @@ on:
schedule:
# If changing the cron schedule, update the provider in the test-matrix job
- cron: '0 0 * * *' # (test latest client) Daily at 12 AM UTC
-- cron: '1 0 * * 0' # (test vllm) Weekly on Sunday at 1 AM UTC
workflow_dispatch:
inputs:
test-all-client-versions:

View file

@@ -9976,6 +9976,70 @@ components:
- metadata
title: VectorStoreObject
description: OpenAI Vector Store object.
VectorStoreChunkingStrategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
- $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
discriminator:
propertyName: type
mapping:
auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
VectorStoreChunkingStrategyAuto:
type: object
properties:
type:
type: string
const: auto
default: auto
description: >-
Strategy type, always "auto" for automatic chunking
additionalProperties: false
required:
- type
title: VectorStoreChunkingStrategyAuto
description: >-
Automatic chunking strategy for vector store files.
VectorStoreChunkingStrategyStatic:
type: object
properties:
type:
type: string
const: static
default: static
description: >-
Strategy type, always "static" for static chunking
static:
$ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
description: >-
Configuration parameters for the static chunking strategy
additionalProperties: false
required:
- type
- static
title: VectorStoreChunkingStrategyStatic
description: >-
Static chunking strategy with configurable parameters.
VectorStoreChunkingStrategyStaticConfig:
type: object
properties:
chunk_overlap_tokens:
type: integer
default: 400
description: >-
Number of tokens to overlap between adjacent chunks
max_chunk_size_tokens:
type: integer
default: 800
description: >-
Maximum number of tokens per chunk, must be between 100 and 4096
additionalProperties: false
required:
- chunk_overlap_tokens
- max_chunk_size_tokens
title: VectorStoreChunkingStrategyStaticConfig
description: >-
Configuration for static chunking strategy.
"OpenAICreateVectorStoreRequestWithExtraBody": "OpenAICreateVectorStoreRequestWithExtraBody":
type: object type: object
properties: properties:
@ -10001,15 +10065,7 @@ components:
description: >- description: >-
(Optional) Expiration policy for the vector store (Optional) Expiration policy for the vector store
chunking_strategy: chunking_strategy:
type: object $ref: '#/components/schemas/VectorStoreChunkingStrategy'
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >- description: >-
(Optional) Strategy for splitting files into chunks (Optional) Strategy for splitting files into chunks
metadata: metadata:
@ -10085,70 +10141,6 @@ components:
- deleted - deleted
title: VectorStoreDeleteResponse title: VectorStoreDeleteResponse
description: Response from deleting a vector store. description: Response from deleting a vector store.
VectorStoreChunkingStrategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
- $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
discriminator:
propertyName: type
mapping:
auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
VectorStoreChunkingStrategyAuto:
type: object
properties:
type:
type: string
const: auto
default: auto
description: >-
Strategy type, always "auto" for automatic chunking
additionalProperties: false
required:
- type
title: VectorStoreChunkingStrategyAuto
description: >-
Automatic chunking strategy for vector store files.
VectorStoreChunkingStrategyStatic:
type: object
properties:
type:
type: string
const: static
default: static
description: >-
Strategy type, always "static" for static chunking
static:
$ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
description: >-
Configuration parameters for the static chunking strategy
additionalProperties: false
required:
- type
- static
title: VectorStoreChunkingStrategyStatic
description: >-
Static chunking strategy with configurable parameters.
VectorStoreChunkingStrategyStaticConfig:
type: object
properties:
chunk_overlap_tokens:
type: integer
default: 400
description: >-
Number of tokens to overlap between adjacent chunks
max_chunk_size_tokens:
type: integer
default: 800
description: >-
Maximum number of tokens per chunk, must be between 100 and 4096
additionalProperties: false
required:
- chunk_overlap_tokens
- max_chunk_size_tokens
title: VectorStoreChunkingStrategyStaticConfig
description: >-
Configuration for static chunking strategy.
"OpenAICreateVectorStoreFileBatchRequestWithExtraBody": "OpenAICreateVectorStoreFileBatchRequestWithExtraBody":
type: object type: object
properties: properties:
@ -10606,7 +10598,9 @@ components:
description: >- description: >-
Object type identifier for the search results page Object type identifier for the search results page
search_query: search_query:
type: string type: array
items:
type: string
description: >- description: >-
The original search query that was executed The original search query that was executed
data: data:
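The schema change above replaces the free-form `chunking_strategy` object with a discriminated union keyed on `type`. For reference, request payloads that match the new union look roughly like this sketch (field values are illustrative; the 400/800 defaults come from `VectorStoreChunkingStrategyStaticConfig` above):

```python
# Sketch: example payloads for the new discriminated union. Per the schema,
# max_chunk_size_tokens must be between 100 and 4096.
auto_strategy = {"type": "auto"}

static_strategy = {
    "type": "static",
    "static": {
        "chunk_overlap_tokens": 400,   # schema default
        "max_chunk_size_tokens": 800,  # schema default
    },
}

create_vector_store_request = {
    "name": "my-vector-store",        # illustrative
    "chunking_strategy": static_strategy,
}
```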

View file

@@ -163,7 +163,41 @@ docker run \
--port $LLAMA_STACK_PORT
```
-### Via venv
+The container will run the distribution with a SQLite store by default. This store is used for the following components:
+
+- Metadata store: store metadata about the models, providers, etc.
+- Inference store: collection of responses from the inference provider
+- Agents store: store agent configurations (sessions, turns, etc.)
+- Agents Responses store: store responses from the agents
+
+However, you can use PostgreSQL instead by running the `starter::run-with-postgres-store.yaml` configuration:
+
+```bash
+docker run \
+-it \
+--pull always \
+-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+-e OPENAI_API_KEY=your_openai_key \
+-e FIREWORKS_API_KEY=your_fireworks_key \
+-e TOGETHER_API_KEY=your_together_key \
+-e POSTGRES_HOST=your_postgres_host \
+-e POSTGRES_PORT=your_postgres_port \
+-e POSTGRES_DB=your_postgres_db \
+-e POSTGRES_USER=your_postgres_user \
+-e POSTGRES_PASSWORD=your_postgres_password \
+llamastack/distribution-starter \
+starter::run-with-postgres-store.yaml
+```
+
+Postgres environment variables:
+
+- `POSTGRES_HOST`: Postgres host (default: `localhost`)
+- `POSTGRES_PORT`: Postgres port (default: `5432`)
+- `POSTGRES_DB`: Postgres database name (default: `llamastack`)
+- `POSTGRES_USER`: Postgres username (default: `llamastack`)
+- `POSTGRES_PASSWORD`: Postgres password (default: `llamastack`)
+
+### Via Conda or venv
Ensure you have configured the starter distribution using the environment variables explained above.
@@ -171,8 +205,11 @@ Ensure you have configured the starter distribution using the environment variab
# Install dependencies for the starter distribution
uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
-# Run the server
+# Run the server (with SQLite - default)
uv run --with llama-stack llama stack run starter
+# Or run with PostgreSQL
+uv run --with llama-stack llama stack run starter::run-with-postgres-store.yaml
```
## Example Usage
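Whichever store backs the server, the HTTP surface is unchanged, so a quick smoke test works the same for SQLite and PostgreSQL. A minimal sketch, assuming the server is reachable on the default port 8321 and that the model listing follows the stack's usual `data`/`identifier` response shape:

```python
# Sketch: smoke-test a running starter distribution via its models listing.
import requests

resp = requests.get("http://localhost:8321/v1/models", timeout=10)
resp.raise_for_status()
for model in resp.json()["data"]:
    print(model["identifier"])
```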

View file

@@ -16,7 +16,7 @@ Passthrough inference provider for connecting to any external inference service not directly supported
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | API Key for the passthrouth endpoint |
+| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | | The URL for the passthrough endpoint |

## Sample Configuration

View file

@@ -9260,6 +9260,70 @@ components:
- metadata
title: VectorStoreObject
description: OpenAI Vector Store object.
VectorStoreChunkingStrategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
- $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
discriminator:
propertyName: type
mapping:
auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
VectorStoreChunkingStrategyAuto:
type: object
properties:
type:
type: string
const: auto
default: auto
description: >-
Strategy type, always "auto" for automatic chunking
additionalProperties: false
required:
- type
title: VectorStoreChunkingStrategyAuto
description: >-
Automatic chunking strategy for vector store files.
VectorStoreChunkingStrategyStatic:
type: object
properties:
type:
type: string
const: static
default: static
description: >-
Strategy type, always "static" for static chunking
static:
$ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
description: >-
Configuration parameters for the static chunking strategy
additionalProperties: false
required:
- type
- static
title: VectorStoreChunkingStrategyStatic
description: >-
Static chunking strategy with configurable parameters.
VectorStoreChunkingStrategyStaticConfig:
type: object
properties:
chunk_overlap_tokens:
type: integer
default: 400
description: >-
Number of tokens to overlap between adjacent chunks
max_chunk_size_tokens:
type: integer
default: 800
description: >-
Maximum number of tokens per chunk, must be between 100 and 4096
additionalProperties: false
required:
- chunk_overlap_tokens
- max_chunk_size_tokens
title: VectorStoreChunkingStrategyStaticConfig
description: >-
Configuration for static chunking strategy.
"OpenAICreateVectorStoreRequestWithExtraBody": "OpenAICreateVectorStoreRequestWithExtraBody":
type: object type: object
properties: properties:
@ -9285,15 +9349,7 @@ components:
description: >- description: >-
(Optional) Expiration policy for the vector store (Optional) Expiration policy for the vector store
chunking_strategy: chunking_strategy:
type: object $ref: '#/components/schemas/VectorStoreChunkingStrategy'
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >- description: >-
(Optional) Strategy for splitting files into chunks (Optional) Strategy for splitting files into chunks
metadata: metadata:
@ -9369,70 +9425,6 @@ components:
- deleted - deleted
title: VectorStoreDeleteResponse title: VectorStoreDeleteResponse
description: Response from deleting a vector store. description: Response from deleting a vector store.
VectorStoreChunkingStrategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
- $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
discriminator:
propertyName: type
mapping:
auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
VectorStoreChunkingStrategyAuto:
type: object
properties:
type:
type: string
const: auto
default: auto
description: >-
Strategy type, always "auto" for automatic chunking
additionalProperties: false
required:
- type
title: VectorStoreChunkingStrategyAuto
description: >-
Automatic chunking strategy for vector store files.
VectorStoreChunkingStrategyStatic:
type: object
properties:
type:
type: string
const: static
default: static
description: >-
Strategy type, always "static" for static chunking
static:
$ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
description: >-
Configuration parameters for the static chunking strategy
additionalProperties: false
required:
- type
- static
title: VectorStoreChunkingStrategyStatic
description: >-
Static chunking strategy with configurable parameters.
VectorStoreChunkingStrategyStaticConfig:
type: object
properties:
chunk_overlap_tokens:
type: integer
default: 400
description: >-
Number of tokens to overlap between adjacent chunks
max_chunk_size_tokens:
type: integer
default: 800
description: >-
Maximum number of tokens per chunk, must be between 100 and 4096
additionalProperties: false
required:
- chunk_overlap_tokens
- max_chunk_size_tokens
title: VectorStoreChunkingStrategyStaticConfig
description: >-
Configuration for static chunking strategy.
"OpenAICreateVectorStoreFileBatchRequestWithExtraBody": "OpenAICreateVectorStoreFileBatchRequestWithExtraBody":
type: object type: object
properties: properties:
@ -9890,7 +9882,9 @@ components:
description: >- description: >-
Object type identifier for the search results page Object type identifier for the search results page
search_query: search_query:
type: string type: array
items:
type: string
description: >- description: >-
The original search query that was executed The original search query that was executed
data: data:

View file

@@ -9976,6 +9976,70 @@ components:
- metadata
title: VectorStoreObject
description: OpenAI Vector Store object.
VectorStoreChunkingStrategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
- $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
discriminator:
propertyName: type
mapping:
auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
VectorStoreChunkingStrategyAuto:
type: object
properties:
type:
type: string
const: auto
default: auto
description: >-
Strategy type, always "auto" for automatic chunking
additionalProperties: false
required:
- type
title: VectorStoreChunkingStrategyAuto
description: >-
Automatic chunking strategy for vector store files.
VectorStoreChunkingStrategyStatic:
type: object
properties:
type:
type: string
const: static
default: static
description: >-
Strategy type, always "static" for static chunking
static:
$ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
description: >-
Configuration parameters for the static chunking strategy
additionalProperties: false
required:
- type
- static
title: VectorStoreChunkingStrategyStatic
description: >-
Static chunking strategy with configurable parameters.
VectorStoreChunkingStrategyStaticConfig:
type: object
properties:
chunk_overlap_tokens:
type: integer
default: 400
description: >-
Number of tokens to overlap between adjacent chunks
max_chunk_size_tokens:
type: integer
default: 800
description: >-
Maximum number of tokens per chunk, must be between 100 and 4096
additionalProperties: false
required:
- chunk_overlap_tokens
- max_chunk_size_tokens
title: VectorStoreChunkingStrategyStaticConfig
description: >-
Configuration for static chunking strategy.
"OpenAICreateVectorStoreRequestWithExtraBody": "OpenAICreateVectorStoreRequestWithExtraBody":
type: object type: object
properties: properties:
@ -10001,15 +10065,7 @@ components:
description: >- description: >-
(Optional) Expiration policy for the vector store (Optional) Expiration policy for the vector store
chunking_strategy: chunking_strategy:
type: object $ref: '#/components/schemas/VectorStoreChunkingStrategy'
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >- description: >-
(Optional) Strategy for splitting files into chunks (Optional) Strategy for splitting files into chunks
metadata: metadata:
@ -10085,70 +10141,6 @@ components:
- deleted - deleted
title: VectorStoreDeleteResponse title: VectorStoreDeleteResponse
description: Response from deleting a vector store. description: Response from deleting a vector store.
VectorStoreChunkingStrategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
- $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
discriminator:
propertyName: type
mapping:
auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
VectorStoreChunkingStrategyAuto:
type: object
properties:
type:
type: string
const: auto
default: auto
description: >-
Strategy type, always "auto" for automatic chunking
additionalProperties: false
required:
- type
title: VectorStoreChunkingStrategyAuto
description: >-
Automatic chunking strategy for vector store files.
VectorStoreChunkingStrategyStatic:
type: object
properties:
type:
type: string
const: static
default: static
description: >-
Strategy type, always "static" for static chunking
static:
$ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
description: >-
Configuration parameters for the static chunking strategy
additionalProperties: false
required:
- type
- static
title: VectorStoreChunkingStrategyStatic
description: >-
Static chunking strategy with configurable parameters.
VectorStoreChunkingStrategyStaticConfig:
type: object
properties:
chunk_overlap_tokens:
type: integer
default: 400
description: >-
Number of tokens to overlap between adjacent chunks
max_chunk_size_tokens:
type: integer
default: 800
description: >-
Maximum number of tokens per chunk, must be between 100 and 4096
additionalProperties: false
required:
- chunk_overlap_tokens
- max_chunk_size_tokens
title: VectorStoreChunkingStrategyStaticConfig
description: >-
Configuration for static chunking strategy.
"OpenAICreateVectorStoreFileBatchRequestWithExtraBody": "OpenAICreateVectorStoreFileBatchRequestWithExtraBody":
type: object type: object
properties: properties:
@ -10606,7 +10598,9 @@ components:
description: >- description: >-
Object type identifier for the search results page Object type identifier for the search results page
search_query: search_query:
type: string type: array
items:
type: string
description: >- description: >-
The original search query that was executed The original search query that was executed
data: data:

View file

@@ -405,11 +405,6 @@ fi
echo "=== Running Integration Tests ==="
EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag"
-# Additional exclusions for vllm setup
-if [[ "$TEST_SETUP" == "vllm" ]]; then
-EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls"
-fi
PYTEST_PATTERN="not( $EXCLUDE_TESTS )"
if [[ -n "$TEST_PATTERN" ]]; then
PYTEST_PATTERN="${PYTEST_PATTERN} and $TEST_PATTERN"

View file

@@ -260,7 +260,7 @@ class VectorStoreSearchResponsePage(BaseModel):
"""
object: str = "vector_store.search_results.page"
-search_query: str
+search_query: list[str]
data: list[VectorStoreSearchResponse]
has_more: bool = False
next_page: str | None = None
@@ -478,7 +478,7 @@ class OpenAICreateVectorStoreRequestWithExtraBody(BaseModel, extra="allow"):
name: str | None = None
file_ids: list[str] | None = None
expires_after: dict[str, Any] | None = None
-chunking_strategy: dict[str, Any] | None = None
+chunking_strategy: VectorStoreChunkingStrategy | None = None
metadata: dict[str, Any] | None = None

View file

@@ -20,6 +20,8 @@ from llama_stack.apis.vector_io import (
SearchRankingOptions,
VectorIO,
VectorStoreChunkingStrategy,
+VectorStoreChunkingStrategyStatic,
+VectorStoreChunkingStrategyStaticConfig,
VectorStoreDeleteResponse,
VectorStoreFileBatchObject,
VectorStoreFileContentsResponse,
@@ -167,6 +169,13 @@ class VectorIORouter(VectorIO):
if embedding_dimension is not None:
params.model_extra["embedding_dimension"] = embedding_dimension
+# Set chunking strategy explicitly if not provided
+if params.chunking_strategy is None or params.chunking_strategy.type == "auto":
+# actualize the chunking strategy to static
+params.chunking_strategy = VectorStoreChunkingStrategyStatic(
+static=VectorStoreChunkingStrategyStaticConfig()
+)
return await provider.openai_create_vector_store(params)
async def openai_list_vector_stores(
@@ -283,6 +292,8 @@ class VectorIORouter(VectorIO):
chunking_strategy: VectorStoreChunkingStrategy | None = None,
) -> VectorStoreFileObject:
logger.debug(f"VectorIORouter.openai_attach_file_to_vector_store: {vector_store_id}, {file_id}")
+if chunking_strategy is None or chunking_strategy.type == "auto":
+chunking_strategy = VectorStoreChunkingStrategyStatic(static=VectorStoreChunkingStrategyStaticConfig())
provider = await self.routing_table.get_provider_impl(vector_store_id)
return await provider.openai_attach_file_to_vector_store(
vector_store_id=vector_store_id,
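The router change "actualizes" a missing or `auto` chunking strategy into an explicit static one before dispatch, so providers always receive concrete parameters. In isolation the rule reduces to the following sketch (plain dataclasses stand in for the project's pydantic models; the 400/800 defaults mirror `VectorStoreChunkingStrategyStaticConfig`):

```python
# Sketch of the defaulting rule in isolation, using hypothetical stand-in types.
from dataclasses import dataclass, field

@dataclass
class StaticConfig:
    chunk_overlap_tokens: int = 400   # schema default
    max_chunk_size_tokens: int = 800  # schema default

@dataclass
class StaticStrategy:
    type: str = "static"
    static: StaticConfig = field(default_factory=StaticConfig)

def actualize(strategy):
    # None or "auto" collapses to an explicit static strategy with defaults.
    if strategy is None or getattr(strategy, "type", None) == "auto":
        return StaticStrategy()
    return strategy

print(actualize(None))  # StaticStrategy(type='static', static=StaticConfig(400, 800))
```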

View file

@@ -52,7 +52,17 @@ def resolve_config_or_distro(
logger.debug(f"Using distribution: {distro_config}")
return distro_config
-# Strategy 3: Try as built distribution name
+# Strategy 3: Try as distro config path (if no .yaml extension and contains a slash)
+# eg: starter::run-with-postgres-store.yaml
+# Use :: to avoid slash and confusion with a filesystem path
+if "::" in config_or_distro:
+distro_name, config_name = config_or_distro.split("::")
+distro_config = _get_distro_config_path(distro_name, config_name)
+if distro_config.exists():
+logger.info(f"Using distribution: {distro_config}")
+return distro_config
+# Strategy 4: Try as built distribution name
distrib_config = DISTRIBS_BASE_DIR / f"llamastack-{config_or_distro}" / f"{config_or_distro}-{mode}.yaml"
if distrib_config.exists():
logger.debug(f"Using built distribution: {distrib_config}")
@@ -63,13 +73,15 @@ def resolve_config_or_distro(
logger.debug(f"Using built distribution: {distrib_config}")
return distrib_config
-# Strategy 4: Failed - provide helpful error
+# Strategy 5: Failed - provide helpful error
raise ValueError(_format_resolution_error(config_or_distro, mode))
-def _get_distro_config_path(distro_name: str, mode: Mode) -> Path:
+def _get_distro_config_path(distro_name: str, mode: str) -> Path:
"""Get the config file path for a distro."""
-return DISTRO_DIR / distro_name / f"{mode}.yaml"
+if not mode.endswith(".yaml"):
+mode = f"{mode}.yaml"
+return DISTRO_DIR / distro_name / mode
def _format_resolution_error(config_or_distro: str, mode: Mode) -> str:
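The new strategy lets one argument name both a distro and a specific run config, e.g. `starter::run-with-postgres-store.yaml`. Stripped of logging and the surrounding strategies, the parsing reduces to this sketch (the base directory here is illustrative, not the project's actual constant):

```python
# Sketch: how a "distro::config" argument splits (filesystem checks omitted).
from pathlib import Path

DISTRO_DIR = Path("~/.llama/distributions").expanduser()  # illustrative base dir

def distro_config_path(distro_name: str, mode: str) -> Path:
    if not mode.endswith(".yaml"):
        mode = f"{mode}.yaml"
    return DISTRO_DIR / distro_name / mode

arg = "starter::run-with-postgres-store.yaml"
if "::" in arg:
    distro_name, config_name = arg.split("::")
    print(distro_config_path(distro_name, config_name))
```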

View file

@@ -84,6 +84,15 @@ def run_command(command: list[str]) -> int:
text=True,
check=False,
)
+# Print stdout and stderr if command failed
+if result.returncode != 0:
+log.error(f"Command {' '.join(command)} failed with returncode {result.returncode}")
+if result.stdout:
+log.error(f"STDOUT: {result.stdout}")
+if result.stderr:
+log.error(f"STDERR: {result.stderr}")
return result.returncode
except subprocess.SubprocessError as e:
log.error(f"Subprocess error: {e}")

View file

@@ -56,4 +56,5 @@ image_type: venv
additional_pip_packages:
- aiosqlite
- asyncpg
+- psycopg2-binary
- sqlalchemy[asyncio]

View file

@@ -13,5 +13,6 @@ from ..starter.starter import get_distribution_template as get_starter_distribut
def get_distribution_template() -> DistributionTemplate:
template = get_starter_distribution_template(name="ci-tests")
template.description = "CI tests for Llama Stack"
+template.run_configs.pop("run-with-postgres-store.yaml", None)
return template

View file

@@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .postgres_demo import get_distribution_template # noqa: F401

View file

@@ -1,23 +0,0 @@
version: 2
distribution_spec:
description: Quick start template for running Llama Stack with several popular providers
providers:
inference:
- provider_type: remote::vllm
- provider_type: inline::sentence-transformers
vector_io:
- provider_type: remote::chromadb
safety:
- provider_type: inline::llama-guard
agents:
- provider_type: inline::meta-reference
tool_runtime:
- provider_type: remote::brave-search
- provider_type: remote::tavily-search
- provider_type: inline::rag-runtime
- provider_type: remote::model-context-protocol
image_type: venv
additional_pip_packages:
- asyncpg
- psycopg2-binary
- sqlalchemy[asyncio]

View file

@@ -1,125 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.models import ModelType
from llama_stack.core.datatypes import (
BuildProvider,
ModelInput,
Provider,
ShieldInput,
ToolGroupInput,
)
from llama_stack.distributions.template import (
DistributionTemplate,
RunConfigSettings,
)
from llama_stack.providers.inline.inference.sentence_transformers import SentenceTransformersInferenceConfig
from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig
from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig
from llama_stack.providers.utils.kvstore.config import PostgresKVStoreConfig
from llama_stack.providers.utils.sqlstore.sqlstore import PostgresSqlStoreConfig
def get_distribution_template() -> DistributionTemplate:
inference_providers = [
Provider(
provider_id="vllm-inference",
provider_type="remote::vllm",
config=VLLMInferenceAdapterConfig.sample_run_config(
url="${env.VLLM_URL:=http://localhost:8000/v1}",
),
),
]
providers = {
"inference": [
BuildProvider(provider_type="remote::vllm"),
BuildProvider(provider_type="inline::sentence-transformers"),
],
"vector_io": [BuildProvider(provider_type="remote::chromadb")],
"safety": [BuildProvider(provider_type="inline::llama-guard")],
"agents": [BuildProvider(provider_type="inline::meta-reference")],
"tool_runtime": [
BuildProvider(provider_type="remote::brave-search"),
BuildProvider(provider_type="remote::tavily-search"),
BuildProvider(provider_type="inline::rag-runtime"),
BuildProvider(provider_type="remote::model-context-protocol"),
],
}
name = "postgres-demo"
vector_io_providers = [
Provider(
provider_id="${env.ENABLE_CHROMADB:+chromadb}",
provider_type="remote::chromadb",
config=ChromaVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}",
url="${env.CHROMADB_URL:=}",
),
),
]
default_tool_groups = [
ToolGroupInput(
toolgroup_id="builtin::websearch",
provider_id="tavily-search",
),
ToolGroupInput(
toolgroup_id="builtin::rag",
provider_id="rag-runtime",
),
]
default_models = [
ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="vllm-inference",
)
]
embedding_provider = Provider(
provider_id="sentence-transformers",
provider_type="inline::sentence-transformers",
config=SentenceTransformersInferenceConfig.sample_run_config(),
)
embedding_model = ModelInput(
model_id="nomic-embed-text-v1.5",
provider_id=embedding_provider.provider_id,
model_type=ModelType.embedding,
metadata={
"embedding_dimension": 768,
},
)
return DistributionTemplate(
name=name,
distro_type="self_hosted",
description="Quick start template for running Llama Stack with several popular providers",
container_image=None,
template_path=None,
providers=providers,
available_models_by_provider={},
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": inference_providers + [embedding_provider],
"vector_io": vector_io_providers,
},
default_models=default_models + [embedding_model],
default_tool_groups=default_tool_groups,
default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
storage_backends={
"kv_default": PostgresKVStoreConfig.sample_run_config(
table_name="llamastack_kvstore",
),
"sql_default": PostgresSqlStoreConfig.sample_run_config(),
},
),
},
run_config_env_vars={
"LLAMA_STACK_PORT": (
"8321",
"Port for the Llama Stack distribution server",
),
},
)

View file

@@ -57,4 +57,5 @@ image_type: venv
additional_pip_packages:
- aiosqlite
- asyncpg
+- psycopg2-binary
- sqlalchemy[asyncio]

View file

@@ -0,0 +1,281 @@
version: 2
image_name: starter-gpu
apis:
- agents
- batches
- datasetio
- eval
- files
- inference
- post_training
- safety
- scoring
- tool_runtime
- vector_io
providers:
inference:
- provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
provider_type: remote::cerebras
config:
base_url: https://api.cerebras.ai
api_key: ${env.CEREBRAS_API_KEY:=}
- provider_id: ${env.OLLAMA_URL:+ollama}
provider_type: remote::ollama
config:
url: ${env.OLLAMA_URL:=http://localhost:11434}
- provider_id: ${env.VLLM_URL:+vllm}
provider_type: remote::vllm
config:
url: ${env.VLLM_URL:=}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: ${env.TGI_URL:+tgi}
provider_type: remote::tgi
config:
url: ${env.TGI_URL:=}
- provider_id: fireworks
provider_type: remote::fireworks
config:
url: https://api.fireworks.ai/inference/v1
api_key: ${env.FIREWORKS_API_KEY:=}
- provider_id: together
provider_type: remote::together
config:
url: https://api.together.xyz/v1
api_key: ${env.TOGETHER_API_KEY:=}
- provider_id: bedrock
provider_type: remote::bedrock
- provider_id: ${env.NVIDIA_API_KEY:+nvidia}
provider_type: remote::nvidia
config:
url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: openai
provider_type: remote::openai
config:
api_key: ${env.OPENAI_API_KEY:=}
base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1}
- provider_id: anthropic
provider_type: remote::anthropic
config:
api_key: ${env.ANTHROPIC_API_KEY:=}
- provider_id: gemini
provider_type: remote::gemini
config:
api_key: ${env.GEMINI_API_KEY:=}
- provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
provider_type: remote::vertexai
config:
project: ${env.VERTEX_AI_PROJECT:=}
location: ${env.VERTEX_AI_LOCATION:=us-central1}
- provider_id: groq
provider_type: remote::groq
config:
url: https://api.groq.com
api_key: ${env.GROQ_API_KEY:=}
- provider_id: sambanova
provider_type: remote::sambanova
config:
url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
vector_io:
- provider_id: faiss
provider_type: inline::faiss
config:
persistence:
namespace: vector_io::faiss
backend: kv_default
- provider_id: sqlite-vec
provider_type: inline::sqlite-vec
config:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec.db
persistence:
namespace: vector_io::sqlite_vec
backend: kv_default
- provider_id: ${env.MILVUS_URL:+milvus}
provider_type: inline::milvus
config:
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter-gpu}/milvus.db
persistence:
namespace: vector_io::milvus
backend: kv_default
- provider_id: ${env.CHROMADB_URL:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
persistence:
namespace: vector_io::chroma_remote
backend: kv_default
- provider_id: ${env.PGVECTOR_DB:+pgvector}
provider_type: remote::pgvector
config:
host: ${env.PGVECTOR_HOST:=localhost}
port: ${env.PGVECTOR_PORT:=5432}
db: ${env.PGVECTOR_DB:=}
user: ${env.PGVECTOR_USER:=}
password: ${env.PGVECTOR_PASSWORD:=}
persistence:
namespace: vector_io::pgvector
backend: kv_default
- provider_id: ${env.QDRANT_URL:+qdrant}
provider_type: remote::qdrant
config:
api_key: ${env.QDRANT_API_KEY:=}
persistence:
namespace: vector_io::qdrant_remote
backend: kv_default
- provider_id: ${env.WEAVIATE_CLUSTER_URL:+weaviate}
provider_type: remote::weaviate
config:
weaviate_api_key: null
weaviate_cluster_url: ${env.WEAVIATE_CLUSTER_URL:=localhost:8080}
persistence:
namespace: vector_io::weaviate
backend: kv_default
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter-gpu/files}
metadata_store:
table_name: files_metadata
backend: sql_default
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
- provider_id: code-scanner
provider_type: inline::code-scanner
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
post_training:
- provider_id: huggingface-gpu
provider_type: inline::huggingface-gpu
config:
checkpoint_format: huggingface
distributed_backend: null
device: cpu
dpo_output_dir: ~/.llama/distributions/starter-gpu/dpo_output
eval:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
kvstore:
namespace: eval
backend: kv_default
datasetio:
- provider_id: huggingface
provider_type: remote::huggingface
config:
kvstore:
namespace: datasetio::huggingface
backend: kv_default
- provider_id: localfs
provider_type: inline::localfs
config:
kvstore:
namespace: datasetio::localfs
backend: kv_default
scoring:
- provider_id: basic
provider_type: inline::basic
- provider_id: llm-as-judge
provider_type: inline::llm-as-judge
- provider_id: braintrust
provider_type: inline::braintrust
config:
openai_api_key: ${env.OPENAI_API_KEY:=}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:=}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:=}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
batches:
- provider_id: reference
provider_type: inline::reference
config:
kvstore:
namespace: batches
backend: kv_postgres
storage:
backends:
kv_postgres:
type: kv_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
sql_postgres:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
stores:
metadata:
namespace: registry
backend: kv_postgres
inference:
table_name: inference_store
backend: sql_postgres
max_write_queue_size: 10000
num_writers: 4
conversations:
table_name: openai_conversations
backend: sql_postgres
prompts:
namespace: prompts
backend: kv_postgres
registered_resources:
models: []
shields: []
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups: []
server:
port: 8321
telemetry:
enabled: true

View file

@@ -57,4 +57,5 @@ image_type: venv
additional_pip_packages:
- aiosqlite
- asyncpg
+- psycopg2-binary
- sqlalchemy[asyncio]

View file

@@ -0,0 +1,278 @@
version: 2
image_name: starter
apis:
- agents
- batches
- datasetio
- eval
- files
- inference
- post_training
- safety
- scoring
- tool_runtime
- vector_io
providers:
inference:
- provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
provider_type: remote::cerebras
config:
base_url: https://api.cerebras.ai
api_key: ${env.CEREBRAS_API_KEY:=}
- provider_id: ${env.OLLAMA_URL:+ollama}
provider_type: remote::ollama
config:
url: ${env.OLLAMA_URL:=http://localhost:11434}
- provider_id: ${env.VLLM_URL:+vllm}
provider_type: remote::vllm
config:
url: ${env.VLLM_URL:=}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: ${env.TGI_URL:+tgi}
provider_type: remote::tgi
config:
url: ${env.TGI_URL:=}
- provider_id: fireworks
provider_type: remote::fireworks
config:
url: https://api.fireworks.ai/inference/v1
api_key: ${env.FIREWORKS_API_KEY:=}
- provider_id: together
provider_type: remote::together
config:
url: https://api.together.xyz/v1
api_key: ${env.TOGETHER_API_KEY:=}
- provider_id: bedrock
provider_type: remote::bedrock
- provider_id: ${env.NVIDIA_API_KEY:+nvidia}
provider_type: remote::nvidia
config:
url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: openai
provider_type: remote::openai
config:
api_key: ${env.OPENAI_API_KEY:=}
base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1}
- provider_id: anthropic
provider_type: remote::anthropic
config:
api_key: ${env.ANTHROPIC_API_KEY:=}
- provider_id: gemini
provider_type: remote::gemini
config:
api_key: ${env.GEMINI_API_KEY:=}
- provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
provider_type: remote::vertexai
config:
project: ${env.VERTEX_AI_PROJECT:=}
location: ${env.VERTEX_AI_LOCATION:=us-central1}
- provider_id: groq
provider_type: remote::groq
config:
url: https://api.groq.com
api_key: ${env.GROQ_API_KEY:=}
- provider_id: sambanova
provider_type: remote::sambanova
config:
url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
vector_io:
- provider_id: faiss
provider_type: inline::faiss
config:
persistence:
namespace: vector_io::faiss
backend: kv_default
- provider_id: sqlite-vec
provider_type: inline::sqlite-vec
config:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db
persistence:
namespace: vector_io::sqlite_vec
backend: kv_default
- provider_id: ${env.MILVUS_URL:+milvus}
provider_type: inline::milvus
config:
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db
persistence:
namespace: vector_io::milvus
backend: kv_default
- provider_id: ${env.CHROMADB_URL:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
persistence:
namespace: vector_io::chroma_remote
backend: kv_default
- provider_id: ${env.PGVECTOR_DB:+pgvector}
provider_type: remote::pgvector
config:
host: ${env.PGVECTOR_HOST:=localhost}
port: ${env.PGVECTOR_PORT:=5432}
db: ${env.PGVECTOR_DB:=}
user: ${env.PGVECTOR_USER:=}
password: ${env.PGVECTOR_PASSWORD:=}
persistence:
namespace: vector_io::pgvector
backend: kv_default
- provider_id: ${env.QDRANT_URL:+qdrant}
provider_type: remote::qdrant
config:
api_key: ${env.QDRANT_API_KEY:=}
persistence:
namespace: vector_io::qdrant_remote
backend: kv_default
- provider_id: ${env.WEAVIATE_CLUSTER_URL:+weaviate}
provider_type: remote::weaviate
config:
weaviate_api_key: null
weaviate_cluster_url: ${env.WEAVIATE_CLUSTER_URL:=localhost:8080}
persistence:
namespace: vector_io::weaviate
backend: kv_default
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
metadata_store:
table_name: files_metadata
backend: sql_default
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
- provider_id: code-scanner
provider_type: inline::code-scanner
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
post_training:
- provider_id: torchtune-cpu
provider_type: inline::torchtune-cpu
config:
checkpoint_format: meta
eval:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
kvstore:
namespace: eval
backend: kv_default
datasetio:
- provider_id: huggingface
provider_type: remote::huggingface
config:
kvstore:
namespace: datasetio::huggingface
backend: kv_default
- provider_id: localfs
provider_type: inline::localfs
config:
kvstore:
namespace: datasetio::localfs
backend: kv_default
scoring:
- provider_id: basic
provider_type: inline::basic
- provider_id: llm-as-judge
provider_type: inline::llm-as-judge
- provider_id: braintrust
provider_type: inline::braintrust
config:
openai_api_key: ${env.OPENAI_API_KEY:=}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:=}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:=}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
batches:
- provider_id: reference
provider_type: inline::reference
config:
kvstore:
namespace: batches
backend: kv_postgres
storage:
backends:
kv_postgres:
type: kv_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
sql_postgres:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
stores:
metadata:
namespace: registry
backend: kv_postgres
inference:
table_name: inference_store
backend: sql_postgres
max_write_queue_size: 10000
num_writers: 4
conversations:
table_name: openai_conversations
backend: sql_postgres
prompts:
namespace: prompts
backend: kv_postgres
registered_resources:
models: []
shields: []
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups: []
server:
port: 8321
telemetry:
enabled: true
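Both new run configs lean on two substitution forms: `${env.VAR:=default}` falls back to a default value, while `${env.VAR:+value}` expands to `value` only when the variable is set (which is how optional providers such as vllm or chromadb get enabled). A rough Python mimic of the assumed semantics, for intuition only (this is not the project's actual resolver):

```python
# Sketch: the two substitution forms used throughout this config, mimicked in
# plain Python (assumed semantics, mirroring bash ${VAR:=default} / ${VAR:+alt}).
import os
import re

def substitute(template: str) -> str:
    def repl(match: re.Match) -> str:
        var, op, value = match.group(1), match.group(2), match.group(3)
        current = os.environ.get(var, "")
        if op == ":=":  # use the env var, falling back to a default
            return current or value
        if op == ":+":  # use `value` only when the env var is set, else empty
            return value if current else ""
        return current
    return re.sub(r"\$\{env\.(\w+)(:=|:\+)([^}]*)\}", repl, template)

os.environ["POSTGRES_HOST"] = "db.internal"
print(substitute("host: ${env.POSTGRES_HOST:=localhost}"))  # host: db.internal
print(substitute("provider_id: ${env.VLLM_URL:+vllm}"))     # empty if VLLM_URL unset
```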

View file

@@ -17,6 +17,11 @@ from llama_stack.core.datatypes import (
ToolGroupInput,
VectorStoresConfig,
)
+from llama_stack.core.storage.datatypes import (
+InferenceStoreReference,
+KVStoreReference,
+SqlStoreReference,
+)
from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings
from llama_stack.providers.datatypes import RemoteProviderSpec
@@ -36,6 +41,7 @@ from llama_stack.providers.remote.vector_io.pgvector.config import (
)
from llama_stack.providers.remote.vector_io.qdrant.config import QdrantVectorIOConfig
from llama_stack.providers.remote.vector_io.weaviate.config import WeaviateVectorIOConfig
+from llama_stack.providers.utils.kvstore.config import PostgresKVStoreConfig
from llama_stack.providers.utils.sqlstore.sqlstore import PostgresSqlStoreConfig
@@ -181,6 +187,62 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
provider_shield_id="${env.CODE_SCANNER_MODEL:=}",
),
]
postgres_config = PostgresSqlStoreConfig.sample_run_config()
default_overrides = {
"inference": remote_inference_providers + [embedding_provider],
"vector_io": [
Provider(
provider_id="faiss",
provider_type="inline::faiss",
config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
),
Provider(
provider_id="sqlite-vec",
provider_type="inline::sqlite-vec",
config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
),
Provider(
provider_id="${env.MILVUS_URL:+milvus}",
provider_type="inline::milvus",
config=MilvusVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
),
Provider(
provider_id="${env.CHROMADB_URL:+chromadb}",
provider_type="remote::chromadb",
config=ChromaVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}/",
url="${env.CHROMADB_URL:=}",
),
),
Provider(
provider_id="${env.PGVECTOR_DB:+pgvector}",
provider_type="remote::pgvector",
config=PGVectorVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}",
db="${env.PGVECTOR_DB:=}",
user="${env.PGVECTOR_USER:=}",
password="${env.PGVECTOR_PASSWORD:=}",
),
),
Provider(
provider_id="${env.QDRANT_URL:+qdrant}",
provider_type="remote::qdrant",
config=QdrantVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}",
url="${env.QDRANT_URL:=}",
),
),
Provider(
provider_id="${env.WEAVIATE_CLUSTER_URL:+weaviate}",
provider_type="remote::weaviate",
config=WeaviateVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}",
cluster_url="${env.WEAVIATE_CLUSTER_URL:=}",
),
),
],
"files": [files_provider],
}
return DistributionTemplate(
name=name,
@@ -189,64 +251,10 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
container_image=None,
template_path=None,
providers=providers,
-additional_pip_packages=PostgresSqlStoreConfig.pip_packages(),
+additional_pip_packages=list(set(PostgresSqlStoreConfig.pip_packages() + PostgresKVStoreConfig.pip_packages())),
run_configs={
"run.yaml": RunConfigSettings(
-provider_overrides={
+provider_overrides=default_overrides,
"inference": remote_inference_providers + [embedding_provider],
"vector_io": [
Provider(
provider_id="faiss",
provider_type="inline::faiss",
config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
),
Provider(
provider_id="sqlite-vec",
provider_type="inline::sqlite-vec",
config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
),
Provider(
provider_id="${env.MILVUS_URL:+milvus}",
provider_type="inline::milvus",
config=MilvusVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
),
Provider(
provider_id="${env.CHROMADB_URL:+chromadb}",
provider_type="remote::chromadb",
config=ChromaVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}/",
url="${env.CHROMADB_URL:=}",
),
),
Provider(
provider_id="${env.PGVECTOR_DB:+pgvector}",
provider_type="remote::pgvector",
config=PGVectorVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}",
db="${env.PGVECTOR_DB:=}",
user="${env.PGVECTOR_USER:=}",
password="${env.PGVECTOR_PASSWORD:=}",
),
),
Provider(
provider_id="${env.QDRANT_URL:+qdrant}",
provider_type="remote::qdrant",
config=QdrantVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}",
url="${env.QDRANT_URL:=}",
),
),
Provider(
provider_id="${env.WEAVIATE_CLUSTER_URL:+weaviate}",
provider_type="remote::weaviate",
config=WeaviateVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}",
cluster_url="${env.WEAVIATE_CLUSTER_URL:=}",
),
),
],
"files": [files_provider],
},
default_models=[],
default_tool_groups=default_tool_groups,
default_shields=default_shields,
@@ -261,6 +269,55 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
default_shield_id="llama-guard",
),
),
"run-with-postgres-store.yaml": RunConfigSettings(
provider_overrides={
**default_overrides,
"agents": [
Provider(
provider_id="meta-reference",
provider_type="inline::meta-reference",
config=dict(
persistence_store=postgres_config,
responses_store=postgres_config,
),
)
],
"batches": [
Provider(
provider_id="reference",
provider_type="inline::reference",
config=dict(
kvstore=KVStoreReference(
backend="kv_postgres",
namespace="batches",
).model_dump(exclude_none=True),
),
)
],
},
storage_backends={
"kv_postgres": PostgresKVStoreConfig.sample_run_config(),
"sql_postgres": postgres_config,
},
storage_stores={
"metadata": KVStoreReference(
backend="kv_postgres",
namespace="registry",
).model_dump(exclude_none=True),
"inference": InferenceStoreReference(
backend="sql_postgres",
table_name="inference_store",
).model_dump(exclude_none=True),
"conversations": SqlStoreReference(
backend="sql_postgres",
table_name="openai_conversations",
).model_dump(exclude_none=True),
"prompts": KVStoreReference(
backend="kv_postgres",
namespace="prompts",
).model_dump(exclude_none=True),
},
),
},
run_config_env_vars={
"LLAMA_STACK_PORT": (

View file

@@ -10,8 +10,8 @@ from .config import PassthroughImplConfig
class PassthroughProviderDataValidator(BaseModel):
-url: str
-api_key: str
+passthrough_url: str
+passthrough_api_key: str
async def get_adapter_impl(config: PassthroughImplConfig, _deps):

View file

@@ -6,7 +6,7 @@
from typing import Any
-from pydantic import Field, SecretStr
+from pydantic import Field
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.schema_utils import json_schema_type
@@ -19,11 +19,6 @@ class PassthroughImplConfig(RemoteInferenceProviderConfig):
description="The URL for the passthrough endpoint",
)
-api_key: SecretStr | None = Field(
-default=None,
-description="API Key for the passthrouth endpoint",
-)
@classmethod
def sample_run_config(
cls, url: str = "${env.PASSTHROUGH_URL}", api_key: str = "${env.PASSTHROUGH_API_KEY}", **kwargs

View file

@@ -5,9 +5,8 @@
 # the root directory of this source tree.
 from collections.abc import AsyncIterator
-from typing import Any
-from llama_stack_client import AsyncLlamaStackClient
+from openai import AsyncOpenAI
 from llama_stack.apis.inference import (
     Inference,
@@ -20,103 +19,117 @@ from llama_stack.apis.inference import (
     OpenAIEmbeddingsResponse,
 )
 from llama_stack.apis.models import Model
-from llama_stack.core.library_client import convert_pydantic_to_json_value
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+from llama_stack.core.request_headers import NeedsRequestProviderData
 from .config import PassthroughImplConfig
-class PassthroughInferenceAdapter(Inference):
+class PassthroughInferenceAdapter(NeedsRequestProviderData, Inference):
     def __init__(self, config: PassthroughImplConfig) -> None:
-        ModelRegistryHelper.__init__(self)
         self.config = config
-    async def initialize(self) -> None:
-        pass
-    async def shutdown(self) -> None:
-        pass
     async def unregister_model(self, model_id: str) -> None:
         pass
     async def register_model(self, model: Model) -> Model:
         return model
-    def _get_client(self) -> AsyncLlamaStackClient:
-        passthrough_url = None
-        passthrough_api_key = None
-        provider_data = None
-        if self.config.url is not None:
-            passthrough_url = self.config.url
-        else:
-            provider_data = self.get_request_provider_data()
-            if provider_data is None or not provider_data.passthrough_url:
-                raise ValueError(
-                    'Pass url of the passthrough endpoint in the header X-LlamaStack-Provider-Data as { "passthrough_url": <your passthrough url>}'
-                )
-            passthrough_url = provider_data.passthrough_url
-        if self.config.api_key is not None:
-            passthrough_api_key = self.config.api_key.get_secret_value()
-        else:
-            provider_data = self.get_request_provider_data()
-            if provider_data is None or not provider_data.passthrough_api_key:
-                raise ValueError(
-                    'Pass API Key for the passthrough endpoint in the header X-LlamaStack-Provider-Data as { "passthrough_api_key": <your api key>}'
-                )
-            passthrough_api_key = provider_data.passthrough_api_key
-        return AsyncLlamaStackClient(
-            base_url=passthrough_url,
-            api_key=passthrough_api_key,
-            provider_data=provider_data,
+    async def list_models(self) -> list[Model]:
+        """List models by calling the downstream /v1/models endpoint."""
+        client = self._get_openai_client()
+        response = await client.models.list()
+        # Convert from OpenAI format to Llama Stack Model format
+        models = []
+        for model_data in response.data:
+            downstream_model_id = model_data.id
+            custom_metadata = getattr(model_data, "custom_metadata", {}) or {}
+            # Prefix identifier with provider ID for local registry
+            local_identifier = f"{self.__provider_id__}/{downstream_model_id}"
+            model = Model(
+                identifier=local_identifier,
+                provider_id=self.__provider_id__,
+                provider_resource_id=downstream_model_id,
+                model_type=custom_metadata.get("model_type", "llm"),
+                metadata=custom_metadata,
+            )
+            models.append(model)
+        return models
+    async def should_refresh_models(self) -> bool:
+        """Passthrough should refresh models since they come from downstream dynamically."""
+        return self.config.refresh_models
+    def _get_openai_client(self) -> AsyncOpenAI:
+        """Get an AsyncOpenAI client configured for the downstream server."""
+        base_url = self._get_passthrough_url()
+        api_key = self._get_passthrough_api_key()
+        return AsyncOpenAI(
+            base_url=f"{base_url.rstrip('/')}/v1",
+            api_key=api_key,
         )
-    async def openai_embeddings(
-        self,
-        params: OpenAIEmbeddingsRequestWithExtraBody,
-    ) -> OpenAIEmbeddingsResponse:
-        raise NotImplementedError()
+    def _get_passthrough_url(self) -> str:
+        """Get the passthrough URL from config or provider data."""
+        if self.config.url is not None:
+            return self.config.url
+        provider_data = self.get_request_provider_data()
+        if provider_data is None:
+            raise ValueError(
+                'Pass url of the passthrough endpoint in the header X-LlamaStack-Provider-Data as { "passthrough_url": <your passthrough url>}'
+            )
+        return provider_data.passthrough_url
+    def _get_passthrough_api_key(self) -> str:
+        """Get the passthrough API key from config or provider data."""
+        if self.config.auth_credential is not None:
+            return self.config.auth_credential.get_secret_value()
+        provider_data = self.get_request_provider_data()
+        if provider_data is None:
+            raise ValueError(
+                'Pass API Key for the passthrough endpoint in the header X-LlamaStack-Provider-Data as { "passthrough_api_key": <your api key>}'
+            )
+        return provider_data.passthrough_api_key
     async def openai_completion(
         self,
         params: OpenAICompletionRequestWithExtraBody,
     ) -> OpenAICompletion:
-        client = self._get_client()
-        model_obj = await self.model_store.get_model(params.model)
-        params = params.model_copy()
-        params.model = model_obj.provider_resource_id
+        """Forward completion request to downstream using OpenAI client."""
+        client = self._get_openai_client()
         request_params = params.model_dump(exclude_none=True)
-        return await client.inference.openai_completion(**request_params)
+        response = await client.completions.create(**request_params)
+        return response  # type: ignore
     async def openai_chat_completion(
         self,
         params: OpenAIChatCompletionRequestWithExtraBody,
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        client = self._get_client()
-        model_obj = await self.model_store.get_model(params.model)
-        params = params.model_copy()
-        params.model = model_obj.provider_resource_id
+        """Forward chat completion request to downstream using OpenAI client."""
+        client = self._get_openai_client()
         request_params = params.model_dump(exclude_none=True)
-        return await client.inference.openai_chat_completion(**request_params)
-    def cast_value_to_json_dict(self, request_params: dict[str, Any]) -> dict[str, Any]:
-        json_params = {}
-        for key, value in request_params.items():
-            json_input = convert_pydantic_to_json_value(value)
-            if isinstance(json_input, dict):
-                json_input = {k: v for k, v in json_input.items() if v is not None}
-            elif isinstance(json_input, list):
-                json_input = [x for x in json_input if x is not None]
-                new_input = []
-                for x in json_input:
-                    if isinstance(x, dict):
-                        x = {k: v for k, v in x.items() if v is not None}
-                    new_input.append(x)
-                json_input = new_input
-            json_params[key] = json_input
-        return json_params
+        response = await client.chat.completions.create(**request_params)
+        return response  # type: ignore
+    async def openai_embeddings(
+        self,
+        params: OpenAIEmbeddingsRequestWithExtraBody,
+    ) -> OpenAIEmbeddingsResponse:
+        """Forward embeddings request to downstream using OpenAI client."""
+        client = self._get_openai_client()
+        request_params = params.model_dump(exclude_none=True)
+        response = await client.embeddings.create(**request_params)
+        return response  # type: ignore
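
Since the rewritten adapter forwards plain OpenAI wire format, the downstream server can be exercised directly with the openai SDK. A minimal sketch of the forwarding path, assuming a vLLM server at localhost:8000 (the address the recordings below were captured against) and that no real API key is enforced:

import asyncio

from openai import AsyncOpenAI


async def main() -> None:
    # Mirrors _get_openai_client(): base_url is "<passthrough_url>/v1"
    client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
    response = await client.chat.completions.create(
        model="Qwen/Qwen3-0.6B",
        messages=[{"role": "user", "content": "Say hello."}],
    )
    print(response.choices[0].message.content)


asyncio.run(main())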

View file

@@ -26,6 +26,7 @@ from llama_stack.apis.vector_io import (
     VectorStoreChunkingStrategy,
     VectorStoreChunkingStrategyAuto,
     VectorStoreChunkingStrategyStatic,
+    VectorStoreChunkingStrategyStaticConfig,
     VectorStoreContent,
     VectorStoreDeleteResponse,
     VectorStoreFileBatchObject,
@@ -414,6 +415,10 @@ class OpenAIVectorStoreMixin(ABC):
             in_progress=0,
             total=0,
         )
+        if not params.chunking_strategy or params.chunking_strategy.type == "auto":
+            chunking_strategy = VectorStoreChunkingStrategyStatic(static=VectorStoreChunkingStrategyStaticConfig())
+        else:
+            chunking_strategy = params.chunking_strategy
         store_info: dict[str, Any] = {
             "id": vector_store_id,
             "object": "vector_store",
@@ -426,7 +431,7 @@ class OpenAIVectorStoreMixin(ABC):
             "expires_at": None,
             "last_active_at": created_at,
             "file_ids": [],
-            "chunking_strategy": params.chunking_strategy,
+            "chunking_strategy": chunking_strategy.model_dump(),
         }
         # Add provider information to metadata if provided
@@ -637,7 +642,7 @@ class OpenAIVectorStoreMixin(ABC):
                 break
         return VectorStoreSearchResponsePage(
-            search_query=search_query,
+            search_query=query if isinstance(query, list) else [query],
             data=data,
             has_more=False,  # For simplicity, we don't implement pagination here
             next_page=None,
@@ -647,7 +652,7 @@ class OpenAIVectorStoreMixin(ABC):
            logger.error(f"Error searching vector store {vector_store_id}: {e}")
            # Return empty results on error
            return VectorStoreSearchResponsePage(
-               search_query=search_query,
+               search_query=query if isinstance(query, list) else [query],
                data=[],
                has_more=False,
                next_page=None,
@@ -886,8 +891,8 @@ class OpenAIVectorStoreMixin(ABC):
         # Determine pagination info
         has_more = len(file_objects) > limit
-        first_id = file_objects[0].id if file_objects else None
-        last_id = file_objects[-1].id if file_objects else None
+        first_id = limited_files[0].id if file_objects else None
+        last_id = limited_files[-1].id if file_objects else None
         return VectorStoreListFilesResponse(
             data=limited_files,
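
The chunking-strategy change normalizes an absent or "auto" strategy into an explicit static config before persisting, so stored metadata always serializes concretely. A sketch of that normalization in isolation, using the names imported in the patched file:

from llama_stack.apis.vector_io import (
    VectorStoreChunkingStrategy,
    VectorStoreChunkingStrategyStatic,
    VectorStoreChunkingStrategyStaticConfig,
)


def normalize_chunking_strategy(
    strategy: VectorStoreChunkingStrategy | None,
) -> VectorStoreChunkingStrategy:
    # Missing or "auto" becomes an explicit static config, matching the
    # branch added to OpenAIVectorStoreMixin above.
    if strategy is None or strategy.type == "auto":
        return VectorStoreChunkingStrategyStatic(static=VectorStoreChunkingStrategyStaticConfig())
    return strategy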

View file

@@ -2,7 +2,8 @@
     "default": [
         {"suite": "base", "setup": "ollama"},
         {"suite": "vision", "setup": "ollama-vision"},
-        {"suite": "responses", "setup": "gpt"}
+        {"suite": "responses", "setup": "gpt"},
+        {"suite": "base-vllm-subset", "setup": "vllm"}
     ],
     "schedules": {
         "1 0 * * 0": [

View file

@@ -0,0 +1,45 @@
{
"test_id": null,
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374291,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-f70298e4ea3e4b4eb7f2cc2deb7a2b01",
"object": "model_permission",
"created": 1762374291,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,84 @@
{
"test_id": "tests/integration/inference/test_tools_with_schemas.py::TestEdgeCases::test_tool_without_schema[txt=vllm/Qwen/Qwen3-0.6B]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
"content": "Call the no args tool"
}
],
"max_tokens": 4096,
"tools": [
{
"type": "function",
"function": {
"name": "no_args_tool",
"description": "Tool with no arguments",
"parameters": {
"type": "object",
"properties": {}
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-452805c3c859",
"choices": [
{
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null,
"message": {
"content": "<think>\nOkay, the user wants me to call the no args tool. Let me check the available functions. There's only one tool provided, which is the no_args_tool with no arguments. Since the user didn't specify any parameters, I should just return the tool call as instructed. I need to make sure the JSON is correctly formatted and within the XML tags. Alright, that's all I need.\n</think>\n\n",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": [
{
"id": "chatcmpl-tool-7a67269afe214c85924c5171612bbdbd",
"function": {
"arguments": "{}",
"name": "no_args_tool"
},
"type": "function"
}
],
"reasoning_content": null
},
"stop_reason": null
}
],
"created": 0,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 101,
"prompt_tokens": 136,
"total_tokens": 237,
"completion_tokens_details": null,
"prompt_tokens_details": null
},
"prompt_logprobs": null,
"kv_transfer_params": null
}
},
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,92 @@
{
"test_id": "tests/integration/inference/test_tools_with_schemas.py::TestOpenAICompatibility::test_openai_chat_completion_with_tools[openai_client-txt=vllm/Qwen/Qwen3-0.6B]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
"content": "What's the weather in Tokyo?"
}
],
"max_tokens": 4096,
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get weather information",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City name"
}
},
"required": [
"location"
]
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-496035259763",
"choices": [
{
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null,
"message": {
"content": "<think>\nOkay, the user is asking about the weather in Tokyo. I need to use the get_weather function for that. The function requires the location parameter, which in this case is Tokyo. I should make sure to specify \"Tokyo\" as the location. Let me check if there are any other parameters needed, but no, the function only needs the location. So the tool call should be straightforward. I'll format the JSON correctly inside the tool_call tags.\n</think>\n\n",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": [
{
"id": "chatcmpl-tool-959b557fa67e4134a2391f5d35e5d5ae",
"function": {
"arguments": "{\"location\": \"Tokyo\"}",
"name": "get_weather"
},
"type": "function"
}
],
"reasoning_content": null
},
"stop_reason": null
}
],
"created": 0,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 117,
"prompt_tokens": 158,
"total_tokens": 275,
"completion_tokens_details": null,
"prompt_tokens_details": null
},
"prompt_logprobs": null,
"kv_transfer_params": null
}
},
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,92 @@
{
"test_id": "tests/integration/inference/test_tools_with_schemas.py::TestOpenAICompatibility::test_openai_format_preserves_complex_schemas[openai_client-txt=vllm/Qwen/Qwen3-0.6B]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
"content": "Process this data"
}
],
"max_tokens": 4096,
"tools": [
{
"type": "function",
"function": {
"name": "process_data",
"description": "Process structured data",
"parameters": {
"type": "object",
"properties": {
"data": {
"$ref": "#/$defs/DataObject"
}
},
"$defs": {
"DataObject": {
"type": "object",
"properties": {
"values": {
"type": "array",
"items": {
"type": "number"
}
}
}
}
}
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-524ead18daad",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "<think>\nOkay, the user wants me to process the data. Let me check the available tools. There's a function called process_data that takes an object with a 'data' parameter. The data is an array of numbers. But the user hasn't provided any specific data yet. They just said \"Process this data.\" Hmm, maybe they expect me to prompt them for the data first. Wait, maybe there's a misunderstanding. Did they include the data in the conversation history? Let me look back. The user's message is \"Process this data.\" No data provided. Oh, maybe they made a mistake and forgot to include it. I need to ask them to provide the data so I can proceed. Let me confirm if there's any data mentioned. No, the current input is just the instruction. So I should ask the user to supply the data array of numbers to process.\n</think>\n\nPlease provide the structured data you'd like me to process. For example, an array of numbers like `[1, 2, 3]`.",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": [],
"reasoning_content": null
},
"stop_reason": null
}
],
"created": 0,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 212,
"prompt_tokens": 180,
"total_tokens": 392,
"completion_tokens_details": null,
"prompt_tokens_details": null
},
"prompt_logprobs": null,
"kv_transfer_params": null
}
},
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,98 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_with_tools[txt=vllm/Qwen/Qwen3-0.6B-inference:chat_completion:tool_calling]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "system",
"content": "Pretend you are a weather assistant."
},
{
"role": "user",
"content": "What's the weather like in San Francisco, CA?"
}
],
"max_tokens": 4096,
"stream": false,
"tool_choice": "auto",
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state (both required), e.g. San Francisco, CA."
}
},
"required": [
"location"
]
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-744052775cf9",
"choices": [
{
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null,
"message": {
"content": "<think>\nOkay, the user is asking about the weather in San Francisco, CA. I need to use the get_weather function. The function requires the location parameter, which is provided as San Francisco, CA. I should make sure to format the arguments correctly as a JSON object. Let me check the required parameters again. The location is required, so I can't omit it. I'll structure the tool call with the name \"get_weather\" and the arguments including \"location\": \"San Francisco, CA\". That should get the current weather information for the user.\n</think>\n\n",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": [
{
"id": "chatcmpl-tool-b59dc311dd914d3dbd6d455b122bc39c",
"function": {
"arguments": "{\"location\": \"San Francisco, CA\"}",
"name": "get_weather"
},
"type": "function"
}
],
"reasoning_content": null
},
"stop_reason": null
}
],
"created": 0,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 138,
"prompt_tokens": 185,
"total_tokens": 323,
"completion_tokens_details": null,
"prompt_tokens_details": null
},
"prompt_logprobs": null,
"kv_transfer_params": null
}
},
"is_streaming": false
},
"id_normalization_mapping": {}
}
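
The tool-calling recordings all share this request shape; for reference, a sketch of the call that would produce the recording above (server address assumed, tool schema copied from the recorded body):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")  # assumed address

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state (both required), e.g. San Francisco, CA.",
                    }
                },
                "required": ["location"],
            },
        },
    }
]

completion = client.chat.completions.create(
    model="Qwen/Qwen3-0.6B",
    messages=[
        {"role": "system", "content": "Pretend you are a weather assistant."},
        {"role": "user", "content": "What's the weather like in San Francisco, CA?"},
    ],
    tools=tools,
    tool_choice="auto",
    max_tokens=4096,
)
print(completion.choices[0].message.tool_calls)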

View file

@@ -0,0 +1,67 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_with_tool_choice_none[txt=vllm/Qwen/Qwen3-0.6B-inference:chat_completion:tool_calling]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "system",
"content": "Pretend you are a weather assistant."
},
{
"role": "user",
"content": "What's the weather like in San Francisco, CA?"
}
],
"max_tokens": 4096,
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-77cf21828360",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "<think>\nOkay, the user is asking about the weather in San Francisco, CA. I need to check the current weather conditions. But wait, I can't access real-time data. I should mention that I can't provide the current weather forecast and ask them to check a reliable source like the National Weather Service or a weather app. Also, maybe suggest they can provide more details if they need help with something else related to the weather.\n</think>\n\nI'm sorry, but I can't provide real-time weather information. However, you can check the current weather for San Francisco, CA using the National Weather Service (NWS) website, weather apps like Weather.com, or local meteorological services. Let me know if there's anything else I can assist with!",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": [],
"reasoning_content": null
},
"stop_reason": null
}
],
"created": 0,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 154,
"prompt_tokens": 33,
"total_tokens": 187,
"completion_tokens_details": null,
"prompt_tokens_details": null
},
"prompt_logprobs": null,
"kv_transfer_params": null
}
},
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,128 @@
{
"test_id": "tests/integration/inference/test_tools_with_schemas.py::TestEdgeCases::test_multiple_tools_with_different_schemas[txt=vllm/Qwen/Qwen3-0.6B]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
"content": "Use one of the available tools"
}
],
"max_tokens": 4096,
"tools": [
{
"type": "function",
"function": {
"name": "simple",
"parameters": {
"type": "object",
"properties": {
"x": {
"type": "string"
}
}
}
}
},
{
"type": "function",
"function": {
"name": "complex",
"parameters": {
"type": "object",
"properties": {
"data": {
"$ref": "#/$defs/Complex"
}
},
"$defs": {
"Complex": {
"type": "object",
"properties": {
"nested": {
"type": "array",
"items": {
"type": "number"
}
}
}
}
}
}
}
},
{
"type": "function",
"function": {
"name": "with_output",
"parameters": {
"type": "object",
"properties": {
"input": {
"type": "string"
}
}
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-853f6a700b98",
"choices": [
{
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null,
"message": {
"content": "<think>\nOkay, let's see. The user wants me to use one of the available tools. The tools provided are simple, complex, and with_output. The simple function takes an argument 'x' of type string. The complex function requires a 'data' parameter that's an object with a nested array of numbers. The with_output function takes an input string.\n\nThe user's query is about using a tool, but there's no specific function name mentioned. Wait, maybe the user expects me to choose one based on the context. Since the tools are available, but the query is a general instruction, perhaps I should ask for clarification. However, the instructions say to use one of the tools if possible. Since the user hasn't specified a particular function, maybe I should check if there's any implicit function needed. But looking at the tools, none are directly related to the query. The user might need to specify which tool to use. Alternatively, maybe the answer is to call the simple function with an example input. But without more context, it's hard to tell. Wait, maybe the user expects me to choose the simplest one. Let's go with the simple function first. So the tool call would be to the simple function with x set to some value. But the user hasn't provided a specific value. Maybe I should state that the tool requires a value. But according to the instructions, if possible, use one of the tools. Since the user hasn't given a value, perhaps the answer is to call the simple function with an example. But the parameters for the simple function require 'x' which is a string. Maybe the user expects me to proceed without needing more info. So I'll proceed by calling the simple function with x as \"example\".\n</think>\n\n",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": [
{
"id": "chatcmpl-tool-12e2ba0189cf484bb936cbb254a5c32a",
"function": {
"arguments": "{\"x\": \"example\"}",
"name": "simple"
},
"type": "function"
}
],
"reasoning_content": null
},
"stop_reason": null
}
],
"created": 0,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 378,
"prompt_tokens": 265,
"total_tokens": 643,
"completion_tokens_details": null,
"prompt_tokens_details": null
},
"prompt_logprobs": null,
"kv_transfer_params": null
}
},
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,114 @@
{
"test_id": "tests/integration/inference/test_tools_with_schemas.py::TestChatCompletionWithTools::test_tool_with_complex_schema[txt=vllm/Qwen/Qwen3-0.6B]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
"content": "Book a flight from SFO to JFK for John Doe"
}
],
"max_tokens": 4096,
"tools": [
{
"type": "function",
"function": {
"name": "book_flight",
"description": "Book a flight",
"parameters": {
"type": "object",
"properties": {
"flight": {
"$ref": "#/$defs/FlightInfo"
},
"passenger": {
"$ref": "#/$defs/Passenger"
}
},
"required": [
"flight",
"passenger"
],
"$defs": {
"FlightInfo": {
"type": "object",
"properties": {
"from": {
"type": "string"
},
"to": {
"type": "string"
},
"date": {
"type": "string",
"format": "date"
}
}
},
"Passenger": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
}
}
}
}
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-cfb292c0f41d",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "<think>\nOkay, the user wants to book a flight from SFO to JFK for John Doe. Let me check the tools available. The provided function is book_flight, which requires flight information and a passenger. The parameters needed are flight (as a FlightInfo object) and passenger (with name and age). The user mentioned SFO to JFK, so the flight details are from and to. The passenger's name is John Doe, but the age isn't provided. Wait, the function parameters require the passenger's name and age, but the user only mentioned the name. Maybe the age is missing? But the user didn't specify it, so perhaps I should note that the age is required. However, the function's required parameters are flight and passenger, so even if age is missing, the function can't be called without it. So I need to include both flight info and passenger details. The user's message only gives the name and destination, not the flight details or age. Therefore, I need to ask for the flight details and the passenger's age. But the user hasn't provided those. So I can't proceed with the function call. Wait, but maybe the user expects me to assume some default? No, the function requires all parameters. Since the user hasn't provided flight details or age, I can't call the function. So the correct response is to prompt the user for those details.\n</think>\n\nThe booking requires the flight details and passenger's age. Could you provide the flight number and John Doe's age?",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": [],
"reasoning_content": null
},
"stop_reason": null
}
],
"created": 0,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 310,
"prompt_tokens": 261,
"total_tokens": 571,
"completion_tokens_details": null,
"prompt_tokens_details": null
},
"prompt_logprobs": null,
"kv_transfer_params": null
}
},
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,96 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_structured_output[txt=vllm/Qwen/Qwen3-0.6B-inference:chat_completion:structured_output]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant. Michael Jordan was born in 1963. His first name is \"Michael\", He played basketball for the Chicago Bulls for 15 seasons and was drafted in 1984"
},
{
"role": "user",
"content": "Please give me information about Michael Jordan."
}
],
"max_tokens": 4096,
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "AnswerFormat",
"schema": {
"properties": {
"first_name": {
"title": "First Name",
"type": "string"
},
"last_name": {
"title": "Last Name",
"type": "string"
},
"year_of_birth": {
"title": "Year Of Birth",
"type": "integer"
}
},
"required": [
"first_name",
"last_name",
"year_of_birth"
],
"title": "AnswerFormat",
"type": "object"
}
}
},
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-df353403c7fb",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "{\"first_name\": \"Michael\", \"last_name\": \"Jordan\", \"year_of_birth\": 1963}",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": [],
"reasoning_content": null
},
"stop_reason": null
}
],
"created": 0,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 28,
"prompt_tokens": 66,
"total_tokens": 94,
"completion_tokens_details": null,
"prompt_tokens_details": null
},
"prompt_logprobs": null,
"kv_transfer_params": null
}
},
"is_streaming": false
},
"id_normalization_mapping": {}
}
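
The recording above exercises vLLM's structured-output path. A sketch of issuing the same request with the openai SDK (server address assumed, schema copied from the recorded body):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")  # assumed address

schema = {
    "properties": {
        "first_name": {"title": "First Name", "type": "string"},
        "last_name": {"title": "Last Name", "type": "string"},
        "year_of_birth": {"title": "Year Of Birth", "type": "integer"},
    },
    "required": ["first_name", "last_name", "year_of_birth"],
    "title": "AnswerFormat",
    "type": "object",
}

completion = client.chat.completions.create(
    model="Qwen/Qwen3-0.6B",
    messages=[{"role": "user", "content": "Please give me information about Michael Jordan."}],
    response_format={"type": "json_schema", "json_schema": {"name": "AnswerFormat", "schema": schema}},
)
print(completion.choices[0].message.content)  # JSON constrained to the schema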

View file

@@ -0,0 +1,92 @@
{
"test_id": "tests/integration/inference/test_tools_with_schemas.py::TestChatCompletionWithTools::test_simple_tool_call[txt=vllm/Qwen/Qwen3-0.6B]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
"content": "What's the weather in San Francisco?"
}
],
"max_tokens": 4096,
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get weather for a location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City name"
}
},
"required": [
"location"
]
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-e89112e7735f",
"choices": [
{
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null,
"message": {
"content": "<think>\nOkay, the user is asking for the weather in San Francisco. I need to check if there's a function available for that. Looking at the tools provided, there's a function called get_weather that requires a location parameter. The description says it gets weather for a location, and the parameter is the city name. The user provided \"San Francisco\" as the location, so I should call the get_weather function with \"San Francisco\" as the argument. I don't see any other parameters needed here, so the tool call should be straightforward. Just make sure the city name is correctly formatted in JSON.\n</think>\n\n",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": [
{
"id": "chatcmpl-tool-feead29842dc40b2831c41ed397f555f",
"function": {
"arguments": "{\"location\": \"San Francisco\"}",
"name": "get_weather"
},
"type": "function"
}
],
"reasoning_content": null
},
"stop_reason": null
}
],
"created": 0,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 146,
"prompt_tokens": 161,
"total_tokens": 307,
"completion_tokens_details": null,
"prompt_tokens_details": null
},
"prompt_logprobs": null,
"kv_transfer_params": null
}
},
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,53 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_guided_choice[txt=vllm/Qwen/Qwen3-0.6B]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"prompt": "I am feeling really sad today.",
"stream": false,
"extra_body": {
"guided_choice": [
"joy",
"sadness"
]
}
},
"endpoint": "/v1/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": {
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "rec-f02f1bfd75ad",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"text": "joy",
"stop_reason": null,
"prompt_logprobs": null
}
],
"created": 0,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": null,
"usage": {
"completion_tokens": 2,
"prompt_tokens": 7,
"total_tokens": 9,
"completion_tokens_details": null,
"prompt_tokens_details": null
},
"kv_transfer_params": null
}
},
"is_streaming": false
},
"id_normalization_mapping": {}
}
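
guided_choice is a vLLM-specific extension carried via extra_body, as the recorded request shows. A sketch of reproducing it with the openai SDK (server address assumed):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")  # assumed address

# extra_body forwards vLLM-specific fields the OpenAI SDK doesn't model natively.
completion = client.completions.create(
    model="Qwen/Qwen3-0.6B",
    prompt="I am feeling really sad today.",
    extra_body={"guided_choice": ["joy", "sadness"]},
)
print(completion.choices[0].text)  # output constrained to "joy" or "sadness"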

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_different_inputs_different_outputs[llama_stack_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375180,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-aeeb49e5e51c42fa94562780165bd620",
"object": "model_permission",
"created": 1762375180,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_dimensions[llama_stack_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375115,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-feec0a894be04f738e12b596ff163b64",
"object": "model_permission",
"created": 1762375115,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_stop_sequence[txt=vllm/Qwen/Qwen3-0.6B-inference:completion:stop_sequence]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374330,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-119e17052e4c4c13bd791af3138d5360",
"object": "model_permission",
"created": 1762374330,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_base64_batch_processing[llama_stack_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375226,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-c6ae673fda084519b3c67947896cd3b0",
"object": "model_permission",
"created": 1762375226,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_encoding_format_base64[openai_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374573,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-3f422354a81e491b87f93d5b192a0e1a",
"object": "model_permission",
"created": 1762374573,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[openai_client-txt=vllm/Qwen/Qwen3-0.6B-inference:chat_completion:streaming_01]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374305,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-794e16e59ddb4216a8bedfdf485b8f24",
"object": "model_permission",
"created": 1762374305,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=vllm/Qwen/Qwen3-0.6B-True]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374317,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-ff7d26d076eb4373a0631a80fe3ae063",
"object": "model_permission",
"created": 1762374317,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[openai_client-txt=vllm/Qwen/Qwen3-0.6B-inference:chat_completion:streaming_02]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375033,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-2a16fede981b43be9e1cbe3dbedd1e74",
"object": "model_permission",
"created": 1762375033,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_guided_choice[txt=vllm/Qwen/Qwen3-0.6B]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374297,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-4bc93704559a4e1d8492aeec7222040c",
"object": "model_permission",
"created": 1762374297,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_invalid_model_error[openai_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374532,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-e353aa079d5145c19953791ac99daeba",
"object": "model_permission",
"created": 1762374532,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=vllm/Qwen/Qwen3-0.6B-True]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375260,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-10c27d1c9e324b18b65321b422e19af9",
"object": "model_permission",
"created": 1762375260,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[openai_client-txt=vllm/Qwen/Qwen3-0.6B-inference:chat_completion:streaming_02]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375040,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-f01c211577294936958dd28046c89dba",
"object": "model_permission",
"created": 1762375040,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=vllm/Qwen/Qwen3-0.6B-inference:chat_completion:non_streaming_02]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375266,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-7166a6fcd331435eb2d0f0a6b23382ed",
"object": "model_permission",
"created": 1762375266,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[openai_client-txt=vllm/Qwen/Qwen3-0.6B-inference:chat_completion:streaming_01]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374301,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-cd16b092c5a04e719ddf786f0c3e935e",
"object": "model_permission",
"created": 1762374301,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_non_streaming[txt=vllm/Qwen/Qwen3-0.6B-inference:completion:sanity]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374295,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-9f71adbb206846bb9d0e12834e41551e",
"object": "model_permission",
"created": 1762374295,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_logprobs_streaming[txt=vllm/Qwen/Qwen3-0.6B-inference:completion:log_probs]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374342,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-a8b7b38c40584a03b4b346b6c181fb93",
"object": "model_permission",
"created": 1762374342,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=vllm/Qwen/Qwen3-0.6B-inference:chat_completion:non_streaming_01]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375235,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-dd48560646f141298f5cc2ef3467e54b",
"object": "model_permission",
"created": 1762375235,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_user_parameter[openai_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374500,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-0ba0c3a54dcb4e57bc0308fd54425933",
"object": "model_permission",
"created": 1762374500,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_inference_store[openai_client-txt=vllm/Qwen/Qwen3-0.6B-True]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374311,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-e95a9ed7439245b5995add97fb50f765",
"object": "model_permission",
"created": 1762374311,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_encoding_format_float[llama_stack_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375099,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-6b5eba46536f43df902871dd257e1676",
"object": "model_permission",
"created": 1762375099,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_encoding_format_base64[llama_stack_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375207,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-bbfbcf20cac146e0ae5e45ae6a42632d",
"object": "model_permission",
"created": 1762375207,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=vllm/Qwen/Qwen3-0.6B-inference:chat_completion:streaming_02]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375273,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-4935d35e00fd4acdbe78662f42342e77",
"object": "model_permission",
"created": 1762375273,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_base64_batch_processing[openai_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374591,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-e19031997a1e44d99c8b5ae55725a887",
"object": "model_permission",
"created": 1762374591,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[openai_client-txt=vllm/Qwen/Qwen3-0.6B-inference:chat_completion:non_streaming_02]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375027,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-ec44b40a73b04912a837001376b59cff",
"object": "model_permission",
"created": 1762375027,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_single_string[openai_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374356,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-3203232f1dbd426aba98ef1593dd3c01",
"object": "model_permission",
"created": 1762374356,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[client_with_models-txt=vllm/Qwen/Qwen3-0.6B-inference:chat_completion:streaming_01]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375248,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-5efe67a621074e979edaaf8fcfee9a80",
"object": "model_permission",
"created": 1762375248,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_user_parameter[llama_stack_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375135,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-5509cf924e5e4fc89091e4593f264258",
"object": "model_permission",
"created": 1762375135,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[openai_client-txt=vllm/Qwen/Qwen3-0.6B-inference:chat_completion:non_streaming_01]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374301,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-72ed55b56df1471b9f71c48bacf8b768",
"object": "model_permission",
"created": 1762374301,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_non_streaming_suffix[txt=vllm/Qwen/Qwen3-0.6B-inference:completion:suffix]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374295,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-abbbfbb49abc4312b2b2011d4d2ba19b",
"object": "model_permission",
"created": 1762374295,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_single_string[llama_stack_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375065,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-d943fbda14264715906334300853cec7",
"object": "model_permission",
"created": 1762375065,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming_with_file[txt=vllm/Qwen/Qwen3-0.6B]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374323,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-15a0a1106fff4fdd8ce7574373fe3cee",
"object": "model_permission",
"created": 1762374323,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_multiple_strings[llama_stack_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375082,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-ef1d3bc6fefc432380ef0eabdf216fd3",
"object": "model_permission",
"created": 1762375082,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_invalid_model_error[llama_stack_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375165,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-b29f7386725b4f13976cd76b6dc3a278",
"object": "model_permission",
"created": 1762375165,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_logprobs[txt=vllm/Qwen/Qwen3-0.6B-inference:completion:log_probs]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374336,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-21db8cc1a31e41eaaa4e653435618645",
"object": "model_permission",
"created": 1762374336,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_different_inputs_different_outputs[openai_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374547,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-09fed2c5660e42658ab23c6d17b7840c",
"object": "model_permission",
"created": 1762374547,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_streaming[txt=vllm/Qwen/Qwen3-0.6B-inference:completion:sanity]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374297,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-70d68a901d2445f6b7f470c600b34c78",
"object": "model_permission",
"created": 1762374297,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_inference_store[openai_client-txt=vllm/Qwen/Qwen3-0.6B-False]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375047,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-a8446fd6718649399402526dc6fe1477",
"object": "model_permission",
"created": 1762375047,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_inference_store[client_with_models-txt=vllm/Qwen/Qwen3-0.6B-True]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375254,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-f6eb51901e6443e492061deac904737c",
"object": "model_permission",
"created": 1762375254,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[client_with_models-txt=vllm/Qwen/Qwen3-0.6B-inference:chat_completion:streaming_02]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375279,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-31e50ba39ad84a7daa1a24a3c77dc550",
"object": "model_permission",
"created": 1762375279,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=vllm/Qwen/Qwen3-0.6B-inference:chat_completion:streaming_01]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375241,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-3ebcc379347541ea94de0f91838829e5",
"object": "model_permission",
"created": 1762375241,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_multiple_strings[openai_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374449,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-311f880045284a469a286b8039177d10",
"object": "model_permission",
"created": 1762374449,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=vllm/Qwen/Qwen3-0.6B-False]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375053,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-2e52800baf7e4d3389892f33feb3f52b",
"object": "model_permission",
"created": 1762375053,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_empty_list_error[llama_stack_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375150,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-cfec81fed838407597a92838017f3ef5",
"object": "model_permission",
"created": 1762375150,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_encoding_format_float[openai_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374466,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-8811d359d9724f8cac7fd6df608f69bd",
"object": "model_permission",
"created": 1762374466,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_dimensions[openai_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374482,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-d4b4923adfdf40b7bd7698aa798e68eb",
"object": "model_permission",
"created": 1762374482,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=vllm/Qwen/Qwen3-0.6B-False]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375291,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-a48cfd65bcd847d7aea01d44e8add51e",
"object": "model_permission",
"created": 1762375291,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_empty_list_error[openai_client-emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762374517,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-916d53706b624fefb83e5dcc699e7a69",
"object": "model_permission",
"created": 1762374517,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,45 @@
{
"test_id": "tests/integration/inference/test_openai_completion.py::test_inference_store[client_with_models-txt=vllm/Qwen/Qwen3-0.6B-False]",
"request": {
"method": "POST",
"url": "http://localhost:8000/v1/v1/models",
"headers": {},
"body": {},
"endpoint": "/v1/models",
"model": ""
},
"response": {
"body": [
{
"__type__": "openai.types.model.Model",
"__data__": {
"id": "Qwen/Qwen3-0.6B",
"created": 1762375285,
"object": "model",
"owned_by": "vllm",
"root": "/root/.cache/Qwen3-0.6B",
"parent": null,
"max_model_len": 8192,
"permission": [
{
"id": "modelperm-e0640be42b814b3394545ebe92d844b3",
"object": "model_permission",
"created": 1762375285,
"allow_create_engine": false,
"allow_sampling": true,
"allow_logprobs": true,
"allow_search_indices": false,
"allow_view": true,
"allow_fine_tuning": false,
"organization": "*",
"group": null,
"is_blocking": false
}
]
}
}
],
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -78,7 +78,7 @@ SETUP_DEFINITIONS: dict[str, Setup] = {
             "VLLM_URL": "http://localhost:8000/v1",
         },
         defaults={
-            "text_model": "vllm/meta-llama/Llama-3.2-1B-Instruct",
+            "text_model": "vllm/Qwen/Qwen3-0.6B",
             "embedding_model": "sentence-transformers/nomic-embed-text-v1.5",
         },
     ),
@ -169,6 +169,11 @@ SUITE_DEFINITIONS: dict[str, Suite] = {
         roots=base_roots,
         default_setup="ollama",
     ),
+    "base-vllm-subset": Suite(
+        name="base-vllm-subset",
+        roots=["tests/integration/inference"],
+        default_setup="vllm",
+    ),
     "responses": Suite(
         name="responses",
         roots=["tests/integration/responses"],

View file

@ -350,7 +350,7 @@ def test_openai_vector_store_search_empty(
     assert search_response is not None
     assert hasattr(search_response, "data")
     assert len(search_response.data) == 0  # Empty store should return no results
-    assert search_response.search_query == "test query"
+    assert search_response.search_query == ["test query"]
     assert search_response.has_more is False
@ -679,7 +679,7 @@ def test_openai_vector_store_attach_file(
     assert file_attach_response.id == file.id
     assert file_attach_response.vector_store_id == vector_store.id
     assert file_attach_response.status == "completed"
-    assert file_attach_response.chunking_strategy.type == "auto"
+    assert file_attach_response.chunking_strategy.type == "static"
     assert file_attach_response.created_at > 0
     assert not file_attach_response.last_error
@ -815,8 +815,8 @@ def test_openai_vector_store_list_files(
     assert set(file_ids) == {file.id for file in files_list.data}
     assert files_list.data[0].object == "vector_store.file"
     assert files_list.data[0].vector_store_id == vector_store.id
-    assert files_list.data[0].status == "completed"
-    assert files_list.data[0].chunking_strategy.type == "auto"
+    assert files_list.data[0].status in ["completed", "in_progress"]
+    assert files_list.data[0].chunking_strategy.type == "static"
     assert files_list.data[0].created_at > 0
     assert files_list.first_id == files_list.data[0].id
     assert not files_list.data[0].last_error
@ -825,7 +825,7 @@ def test_openai_vector_store_list_files(
     assert first_page.has_more
     assert len(first_page.data) == 2
     assert first_page.first_id == first_page.data[0].id
-    assert first_page.last_id != first_page.data[-1].id
+    assert first_page.last_id == first_page.data[-1].id
     next_page = compat_client.vector_stores.files.list(
         vector_store_id=vector_store.id, limit=2, after=first_page.data[-1].id