Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-03 18:00:36 +00:00)

Commit b060f73e6d: Merge remote-tracking branch 'upstream/main' into feat/gunicorn-production-server

70 changed files with 46,290 additions and 1,133 deletions
.github/workflows/pre-commit.yml (vendored, 18 changes)
@@ -43,6 +43,9 @@ jobs:
          cache: 'npm'
          cache-dependency-path: 'src/llama_stack/ui/'

      - name: Set up uv
        uses: astral-sh/setup-uv@2ddd2b9cb38ad8efd50337e8ab201519a34c9f24 # v7.1.1

      - name: Install npm dependencies
        run: npm ci
        working-directory: src/llama_stack/ui

@@ -52,7 +55,7 @@ jobs:
        uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
        continue-on-error: true
        env:
          SKIP: no-commit-to-branch
          SKIP: no-commit-to-branch,mypy
          RUFF_OUTPUT_FORMAT: github

      - name: Check pre-commit results

@@ -109,3 +112,16 @@ jobs:
            echo "$unstaged_files"
            exit 1
          fi

      - name: Sync dev + type_checking dependencies
        run: uv sync --group dev --group type_checking

      - name: Run mypy (full type_checking)
        run: |
          set +e
          uv run --group dev --group type_checking mypy
          status=$?
          if [ $status -ne 0 ]; then
            echo "::error::Full mypy failed. Reproduce locally with 'uv run pre-commit run mypy-full --hook-stage manual --all-files'."
          fi
          exit $status
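The new step's error message already points at the local equivalent; a minimal sketch of reproducing the CI check on a developer machine, assuming `uv` is installed and the project defines the `dev` and `type_checking` dependency groups used above:

```bash
# Install the same dependency groups the workflow syncs
uv sync --group dev --group type_checking

# Run the full mypy pass exactly as the CI step does
uv run --group dev --group type_checking mypy

# Or drive it through the manual pre-commit hook the error message references
uv run pre-commit run mypy-full --hook-stage manual --all-files
```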
.gitignore (vendored, 3 changes)

@@ -32,3 +32,6 @@ CLAUDE.md
docs/.docusaurus/
docs/node_modules/
docs/static/imported-files/
docs/docs/api-deprecated/
docs/docs/api-experimental/
docs/docs/api/
.pre-commit-config.yaml

@@ -57,17 +57,27 @@ repos:
    hooks:
      - id: uv-lock

  - repo: local
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.18.2
    hooks:
      - id: mypy
        name: mypy
        additional_dependencies:
          - uv==0.7.8
        entry: uv run --group dev --group type_checking mypy
        language: python
        types: [python]
          - uv==0.6.2
          - pytest
          - rich
          - types-requests
          - pydantic
          - httpx
        pass_filenames: false
        require_serial: true

  - repo: local
    hooks:
      - id: mypy-full
        name: mypy (full type_checking)
        entry: uv run --group dev --group type_checking mypy
        language: system
        pass_filenames: false
        stages: [manual]

  # - repo: https://github.com/tcort/markdown-link-check
  #   rev: v3.11.2

@@ -152,7 +162,6 @@ repos:
        files: ^src/llama_stack/ui/.*\.(ts|tsx)$
        pass_filenames: false
        require_serial: true

      - id: check-log-usage
        name: Ensure 'llama_stack.log' usage for logging
        entry: bash

@@ -171,7 +180,23 @@ repos:
            exit 1
          fi
          exit 0

      - id: fips-compliance
        name: Ensure llama-stack remains FIPS compliant
        entry: bash
        language: system
        types: [python]
        pass_filenames: true
        exclude: '^tests/.*$' # Exclude test dir as some safety tests used MD5
        args:
          - -c
          - |
            grep -EnH '^[^#]*\b(md5|sha1|uuid3|uuid5)\b' "$@" && {
              echo;
              echo "❌ Do not use any of the following functions: hashlib.md5, hashlib.sha1, uuid.uuid3, uuid.uuid5"
              echo "   These functions are not FIPS-compliant"
              echo;
              exit 1;
            } || true

ci:
  autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
  autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate
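The fips-compliance hook is just a grep over the staged Python files; a minimal sketch of the same check run by hand on a single file (the file path is a placeholder):

```bash
# Flag any non-comment use of FIPS-disallowed hash/UUID helpers in one file.
grep -EnH '^[^#]*\b(md5|sha1|uuid3|uuid5)\b' src/llama_stack/example.py && {
  echo
  echo "❌ Do not use hashlib.md5, hashlib.sha1, uuid.uuid3, or uuid.uuid5"
  echo "   These functions are not FIPS-compliant"
  echo
  exit 1
} || true
```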
@@ -61,6 +61,18 @@ uv run pre-commit run --all-files -v

The `-v` (verbose) parameter is optional but often helpful for getting more information about any issues that the pre-commit checks identify.

To run the expanded mypy configuration that CI enforces, use:

```bash
uv run pre-commit run mypy-full --hook-stage manual --all-files
```

or invoke mypy directly with all optional dependencies:

```bash
uv run --group dev --group type_checking mypy
```

```{caution}
Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
```
@@ -1,610 +0,0 @@
# yaml-language-server: $schema=https://app.stainlessapi.com/config-internal.schema.json

organization:
  # Name of your organization or company, used to determine the name of the client
  # and headings.
  name: llama-stack-client
  docs: https://llama-stack.readthedocs.io/en/latest/
  contact: llamastack@meta.com
security:
  - {}
  - BearerAuth: []
security_schemes:
  BearerAuth:
    type: http
    scheme: bearer
# `targets` define the output targets and their customization options, such as
# whether to emit the Node SDK and what its package name should be.
targets:
  node:
    package_name: llama-stack-client
    production_repo: llamastack/llama-stack-client-typescript
    publish:
      npm: false
  python:
    package_name: llama_stack_client
    production_repo: llamastack/llama-stack-client-python
    options:
      use_uv: true
    publish:
      pypi: true
    project_name: llama_stack_client
  kotlin:
    reverse_domain: com.llama_stack_client.api
    production_repo: null
    publish:
      maven: false
  go:
    package_name: llama-stack-client
    production_repo: llamastack/llama-stack-client-go
    options:
      enable_v2: true
      back_compat_use_shared_package: false

# `client_settings` define settings for the API client, such as extra constructor
# arguments (used for authentication), retry behavior, idempotency, etc.
client_settings:
  default_env_prefix: LLAMA_STACK_CLIENT
  opts:
    api_key:
      type: string
      read_env: LLAMA_STACK_CLIENT_API_KEY
      auth: { security_scheme: BearerAuth }
      nullable: true

# `environments` are a map of the name of the environment (e.g. "sandbox",
# "production") to the corresponding url to use.
environments:
  production: http://any-hosted-llama-stack.com

# `pagination` defines [pagination schemes] which provides a template to match
# endpoints and generate next-page and auto-pagination helpers in the SDKs.
pagination:
  - name: datasets_iterrows
    type: offset
    request:
      dataset_id:
        type: string
      start_index:
        type: integer
        x-stainless-pagination-property:
          purpose: offset_count_param
      limit:
        type: integer
    response:
      data:
        type: array
        items:
          type: object
      next_index:
        type: integer
        x-stainless-pagination-property:
          purpose: offset_count_start_field
  - name: openai_cursor_page
    type: cursor
    request:
      limit:
        type: integer
      after:
        type: string
        x-stainless-pagination-property:
          purpose: next_cursor_param
    response:
      data:
        type: array
        items: {}
      has_more:
        type: boolean
      last_id:
        type: string
        x-stainless-pagination-property:
          purpose: next_cursor_field
# `resources` define the structure and organization for your API, such as how
# methods and models are grouped together and accessed. See the [configuration
# guide] for more information.
#
# [configuration guide]:
# https://app.stainlessapi.com/docs/guides/configure#resources
resources:
  $shared:
    models:
      agent_config: AgentConfig
      interleaved_content_item: InterleavedContentItem
      interleaved_content: InterleavedContent
      param_type: ParamType
      safety_violation: SafetyViolation
      sampling_params: SamplingParams
      scoring_result: ScoringResult
      message: Message
      user_message: UserMessage
      completion_message: CompletionMessage
      tool_response_message: ToolResponseMessage
      system_message: SystemMessage
      tool_call: ToolCall
      query_result: RAGQueryResult
      document: RAGDocument
      query_config: RAGQueryConfig
      response_format: ResponseFormat
  toolgroups:
    models:
      tool_group: ToolGroup
      list_tool_groups_response: ListToolGroupsResponse
    methods:
      register: post /v1/toolgroups
      get: get /v1/toolgroups/{toolgroup_id}
      list: get /v1/toolgroups
      unregister: delete /v1/toolgroups/{toolgroup_id}
  tools:
    methods:
      get: get /v1/tools/{tool_name}
      list:
        endpoint: get /v1/tools
        paginated: false

  tool_runtime:
    models:
      tool_def: ToolDef
      tool_invocation_result: ToolInvocationResult
    methods:
      list_tools:
        endpoint: get /v1/tool-runtime/list-tools
        paginated: false
      invoke_tool: post /v1/tool-runtime/invoke
    subresources:
      rag_tool:
        methods:
          insert: post /v1/tool-runtime/rag-tool/insert
          query: post /v1/tool-runtime/rag-tool/query

  responses:
    models:
      response_object_stream: OpenAIResponseObjectStream
      response_object: OpenAIResponseObject
    methods:
      create:
        type: http
        endpoint: post /v1/responses
        streaming:
          stream_event_model: responses.response_object_stream
          param_discriminator: stream
      retrieve: get /v1/responses/{response_id}
      list:
        type: http
        endpoint: get /v1/responses
      delete:
        type: http
        endpoint: delete /v1/responses/{response_id}
    subresources:
      input_items:
        methods:
          list:
            type: http
            endpoint: get /v1/responses/{response_id}/input_items

  conversations:
    models:
      conversation_object: Conversation
    methods:
      create:
        type: http
        endpoint: post /v1/conversations
      retrieve: get /v1/conversations/{conversation_id}
      update:
        type: http
        endpoint: post /v1/conversations/{conversation_id}
      delete:
        type: http
        endpoint: delete /v1/conversations/{conversation_id}
    subresources:
      items:
        methods:
          get:
            type: http
            endpoint: get /v1/conversations/{conversation_id}/items/{item_id}
          list:
            type: http
            endpoint: get /v1/conversations/{conversation_id}/items
          create:
            type: http
            endpoint: post /v1/conversations/{conversation_id}/items

  inspect:
    models:
      healthInfo: HealthInfo
      providerInfo: ProviderInfo
      routeInfo: RouteInfo
      versionInfo: VersionInfo
    methods:
      health: get /v1/health
      version: get /v1/version

  embeddings:
    models:
      create_embeddings_response: OpenAIEmbeddingsResponse
    methods:
      create: post /v1/embeddings

  chat:
    models:
      chat_completion_chunk: OpenAIChatCompletionChunk
    subresources:
      completions:
        methods:
          create:
            type: http
            endpoint: post /v1/chat/completions
            streaming:
              stream_event_model: chat.chat_completion_chunk
              param_discriminator: stream
          list:
            type: http
            endpoint: get /v1/chat/completions
          retrieve:
            type: http
            endpoint: get /v1/chat/completions/{completion_id}
  completions:
    methods:
      create:
        type: http
        endpoint: post /v1/completions
        streaming:
          param_discriminator: stream

  vector_io:
    models:
      queryChunksResponse: QueryChunksResponse
    methods:
      insert: post /v1/vector-io/insert
      query: post /v1/vector-io/query

  vector_stores:
    models:
      vector_store: VectorStoreObject
      list_vector_stores_response: VectorStoreListResponse
      vector_store_delete_response: VectorStoreDeleteResponse
      vector_store_search_response: VectorStoreSearchResponsePage
    methods:
      create: post /v1/vector_stores
      list:
        endpoint: get /v1/vector_stores
      retrieve: get /v1/vector_stores/{vector_store_id}
      update: post /v1/vector_stores/{vector_store_id}
      delete: delete /v1/vector_stores/{vector_store_id}
      search: post /v1/vector_stores/{vector_store_id}/search
    subresources:
      files:
        models:
          vector_store_file: VectorStoreFileObject
        methods:
          list: get /v1/vector_stores/{vector_store_id}/files
          retrieve: get /v1/vector_stores/{vector_store_id}/files/{file_id}
          update: post /v1/vector_stores/{vector_store_id}/files/{file_id}
          delete: delete /v1/vector_stores/{vector_store_id}/files/{file_id}
          create: post /v1/vector_stores/{vector_store_id}/files
          content: get /v1/vector_stores/{vector_store_id}/files/{file_id}/content
      file_batches:
        models:
          vector_store_file_batches: VectorStoreFileBatchObject
          list_vector_store_files_in_batch_response: VectorStoreFilesListInBatchResponse
        methods:
          create: post /v1/vector_stores/{vector_store_id}/file_batches
          retrieve: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}
          list_files: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files
          cancel: post /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel

  models:
    models:
      model: Model
      list_models_response: ListModelsResponse
    methods:
      retrieve: get /v1/models/{model_id}
      list:
        endpoint: get /v1/models
        paginated: false
      register: post /v1/models
      unregister: delete /v1/models/{model_id}
    subresources:
      openai:
        methods:
          list:
            endpoint: get /v1/models
            paginated: false

  providers:
    models:
      list_providers_response: ListProvidersResponse
    methods:
      list:
        endpoint: get /v1/providers
        paginated: false
      retrieve: get /v1/providers/{provider_id}

  routes:
    models:
      list_routes_response: ListRoutesResponse
    methods:
      list:
        endpoint: get /v1/inspect/routes
        paginated: false


  moderations:
    models:
      create_response: ModerationObject
    methods:
      create: post /v1/moderations


  safety:
    models:
      run_shield_response: RunShieldResponse
    methods:
      run_shield: post /v1/safety/run-shield


  shields:
    models:
      shield: Shield
      list_shields_response: ListShieldsResponse
    methods:
      retrieve: get /v1/shields/{identifier}
      list:
        endpoint: get /v1/shields
        paginated: false
      register: post /v1/shields
      delete: delete /v1/shields/{identifier}

  synthetic_data_generation:
    models:
      syntheticDataGenerationResponse: SyntheticDataGenerationResponse
    methods:
      generate: post /v1/synthetic-data-generation/generate

  telemetry:
    models:
      span_with_status: SpanWithStatus
      trace: Trace
      query_spans_response: QuerySpansResponse
      event: Event
      query_condition: QueryCondition
    methods:
      query_traces:
        endpoint: post /v1alpha/telemetry/traces
        skip_test_reason: 'unsupported query params in java / kotlin'
      get_span_tree: post /v1alpha/telemetry/spans/{span_id}/tree
      query_spans:
        endpoint: post /v1alpha/telemetry/spans
        skip_test_reason: 'unsupported query params in java / kotlin'
      query_metrics:
        endpoint: post /v1alpha/telemetry/metrics/{metric_name}
        skip_test_reason: 'unsupported query params in java / kotlin'
      # log_event: post /v1alpha/telemetry/events
      save_spans_to_dataset: post /v1alpha/telemetry/spans/export
      get_span: get /v1alpha/telemetry/traces/{trace_id}/spans/{span_id}
      get_trace: get /v1alpha/telemetry/traces/{trace_id}

  scoring:
    methods:
      score: post /v1/scoring/score
      score_batch: post /v1/scoring/score-batch
  scoring_functions:
    methods:
      retrieve: get /v1/scoring-functions/{scoring_fn_id}
      list:
        endpoint: get /v1/scoring-functions
        paginated: false
      register: post /v1/scoring-functions
    models:
      scoring_fn: ScoringFn
      scoring_fn_params: ScoringFnParams
      list_scoring_functions_response: ListScoringFunctionsResponse

  benchmarks:
    methods:
      retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
      list:
        endpoint: get /v1alpha/eval/benchmarks
        paginated: false
      register: post /v1alpha/eval/benchmarks
    models:
      benchmark: Benchmark
      list_benchmarks_response: ListBenchmarksResponse

  files:
    methods:
      create: post /v1/files
      list: get /v1/files
      retrieve: get /v1/files/{file_id}
      delete: delete /v1/files/{file_id}
      content: get /v1/files/{file_id}/content
    models:
      file: OpenAIFileObject
      list_files_response: ListOpenAIFileResponse
      delete_file_response: OpenAIFileDeleteResponse

  alpha:
    subresources:
      inference:
        methods:
          rerank: post /v1alpha/inference/rerank

      post_training:
        models:
          algorithm_config: AlgorithmConfig
          post_training_job: PostTrainingJob
          list_post_training_jobs_response: ListPostTrainingJobsResponse
        methods:
          preference_optimize: post /v1alpha/post-training/preference-optimize
          supervised_fine_tune: post /v1alpha/post-training/supervised-fine-tune
        subresources:
          job:
            methods:
              artifacts: get /v1alpha/post-training/job/artifacts
              cancel: post /v1alpha/post-training/job/cancel
              status: get /v1alpha/post-training/job/status
              list:
                endpoint: get /v1alpha/post-training/jobs
                paginated: false

      eval:
        methods:
          evaluate_rows: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
          run_eval: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
          evaluate_rows_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
          run_eval_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs

        subresources:
          jobs:
            methods:
              cancel: delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
              status: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
              retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
        models:
          evaluate_response: EvaluateResponse
          benchmark_config: BenchmarkConfig
          job: Job

      agents:
        methods:
          create: post /v1alpha/agents
          list: get /v1alpha/agents
          retrieve: get /v1alpha/agents/{agent_id}
          delete: delete /v1alpha/agents/{agent_id}
        models:
          inference_step: InferenceStep
          tool_execution_step: ToolExecutionStep
          tool_response: ToolResponse
          shield_call_step: ShieldCallStep
          memory_retrieval_step: MemoryRetrievalStep
        subresources:
          session:
            models:
              session: Session
            methods:
              list: get /v1alpha/agents/{agent_id}/sessions
              create: post /v1alpha/agents/{agent_id}/session
              delete: delete /v1alpha/agents/{agent_id}/session/{session_id}
              retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}
          steps:
            methods:
              retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}
          turn:
            models:
              turn: Turn
              turn_response_event: AgentTurnResponseEvent
              agent_turn_response_stream_chunk: AgentTurnResponseStreamChunk
            methods:
              create:
                type: http
                endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn
                streaming:
                  stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
                  param_discriminator: stream
              retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}
              resume:
                type: http
                endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume
                streaming:
                  stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
                  param_discriminator: stream

  beta:
    subresources:
      datasets:
        models:
          list_datasets_response: ListDatasetsResponse
        methods:
          register: post /v1beta/datasets
          retrieve: get /v1beta/datasets/{dataset_id}
          list:
            endpoint: get /v1beta/datasets
            paginated: false
          unregister: delete /v1beta/datasets/{dataset_id}
          iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
          appendrows: post /v1beta/datasetio/append-rows/{dataset_id}


settings:
  license: MIT
  unwrap_response_fields: [ data ]

openapi:
  transformations:
    - command: renameValue
      reason: pydantic reserved name
      args:
        filter:
          only:
            - '$.components.schemas.InferenceStep.properties.model_response'
        rename:
          python:
            property_name: 'inference_model_response'

    # - command: renameValue
    #   reason: pydantic reserved name
    #   args:
    #     filter:
    #       only:
    #         - '$.components.schemas.Model.properties.model_type'
    #     rename:
    #       python:
    #         property_name: 'type'
    - command: mergeObject
      reason: Better return_type using enum
      args:
        target:
          - '$.components.schemas'
        object:
          ReturnType:
            additionalProperties: false
            properties:
              type:
                enum:
                  - string
                  - number
                  - boolean
                  - array
                  - object
                  - json
                  - union
                  - chat_completion_input
                  - completion_input
                  - agent_turn_input
            required:
              - type
            type: object
    - command: replaceProperties
      reason: Replace return type properties with better model (see above)
      args:
        filter:
          only:
            - '$.components.schemas.ScoringFn.properties.return_type'
            - '$.components.schemas.RegisterScoringFunctionRequest.properties.return_type'
        value:
          $ref: '#/components/schemas/ReturnType'
    - command: oneOfToAnyOf
      reason: Prism (mock server) doesn't like one of our requests as it technically matches multiple variants
    - reason: For better names
      command: extractToRefs
      args:
        ref:
          target: '$.components.schemas.ToolCallDelta.properties.tool_call'
          name: '#/components/schemas/ToolCallOrString'

# `readme` is used to configure the code snippets that will be rendered in the
# README.md of various SDKs. In particular, you can change the `headline`
# snippet's endpoint and the arguments to call it with.
readme:
  example_requests:
    default:
      type: request
      endpoint: post /v1/chat/completions
      params: &ref_0 {}
    headline:
      type: request
      endpoint: post /v1/models
      params: *ref_0
    pagination:
      type: request
      endpoint: post /v1/chat/completions
      params: {}
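The `datasets_iterrows` offset scheme above is what drove auto-pagination for the beta `iterrows` method; a minimal curl sketch against the config's example server (the dataset ID is a placeholder):

```bash
# First page: offset pagination via start_index/limit.
curl -s "http://any-hosted-llama-stack.com/v1beta/datasetio/iterrows/my-dataset?start_index=0&limit=100"

# The response's next_index is the offset to pass as start_index for the
# following page (the generated SDKs read it via offset_count_start_field).
curl -s "http://any-hosted-llama-stack.com/v1beta/datasetio/iterrows/my-dataset?start_index=100&limit=100"
```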
docs/static/llama-stack-spec.yaml

@@ -15,6 +15,141 @@ info:
servers:
  - url: http://any-hosted-llama-stack.com
paths:
  /v1/batches:
    get:
      responses:
        '200':
          description: A list of batch objects.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ListBatchesResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Batches
      summary: List all batches for the current user.
      description: List all batches for the current user.
      parameters:
        - name: after
          in: query
          description: >-
            A cursor for pagination; returns batches after this batch ID.
          required: false
          schema:
            type: string
        - name: limit
          in: query
          description: >-
            Number of batches to return (default 20, max 100).
          required: true
          schema:
            type: integer
      deprecated: false
    post:
      responses:
        '200':
          description: The created batch object.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Batch'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Batches
      summary: >-
        Create a new batch for processing multiple API requests.
      description: >-
        Create a new batch for processing multiple API requests.
      parameters: []
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateBatchRequest'
        required: true
      deprecated: false
  /v1/batches/{batch_id}:
    get:
      responses:
        '200':
          description: The batch object.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Batch'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Batches
      summary: >-
        Retrieve information about a specific batch.
      description: >-
        Retrieve information about a specific batch.
      parameters:
        - name: batch_id
          in: path
          description: The ID of the batch to retrieve.
          required: true
          schema:
            type: string
      deprecated: false
  /v1/batches/{batch_id}/cancel:
    post:
      responses:
        '200':
          description: The updated batch object.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Batch'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Batches
      summary: Cancel a batch that is in progress.
      description: Cancel a batch that is in progress.
      parameters:
        - name: batch_id
          in: path
          description: The ID of the batch to cancel.
          required: true
          schema:
            type: string
      deprecated: false
  /v1/chat/completions:
    get:
      responses:
@@ -4212,6 +4347,331 @@ components:
      title: Error
      description: >-
        Error response from the API. Roughly follows RFC 7807.
    ListBatchesResponse:
      type: object
      properties:
        object:
          type: string
          const: list
          default: list
        data:
          type: array
          items:
            type: object
            properties:
              id:
                type: string
              completion_window:
                type: string
              created_at:
                type: integer
              endpoint:
                type: string
              input_file_id:
                type: string
              object:
                type: string
                const: batch
              status:
                type: string
                enum:
                  - validating
                  - failed
                  - in_progress
                  - finalizing
                  - completed
                  - expired
                  - cancelling
                  - cancelled
              cancelled_at:
                type: integer
              cancelling_at:
                type: integer
              completed_at:
                type: integer
              error_file_id:
                type: string
              errors:
                type: object
                properties:
                  data:
                    type: array
                    items:
                      type: object
                      properties:
                        code:
                          type: string
                        line:
                          type: integer
                        message:
                          type: string
                        param:
                          type: string
                      additionalProperties: false
                      title: BatchError
                  object:
                    type: string
                additionalProperties: false
                title: Errors
              expired_at:
                type: integer
              expires_at:
                type: integer
              failed_at:
                type: integer
              finalizing_at:
                type: integer
              in_progress_at:
                type: integer
              metadata:
                type: object
                additionalProperties:
                  type: string
              model:
                type: string
              output_file_id:
                type: string
              request_counts:
                type: object
                properties:
                  completed:
                    type: integer
                  failed:
                    type: integer
                  total:
                    type: integer
                additionalProperties: false
                required:
                  - completed
                  - failed
                  - total
                title: BatchRequestCounts
              usage:
                type: object
                properties:
                  input_tokens:
                    type: integer
                  input_tokens_details:
                    type: object
                    properties:
                      cached_tokens:
                        type: integer
                    additionalProperties: false
                    required:
                      - cached_tokens
                    title: InputTokensDetails
                  output_tokens:
                    type: integer
                  output_tokens_details:
                    type: object
                    properties:
                      reasoning_tokens:
                        type: integer
                    additionalProperties: false
                    required:
                      - reasoning_tokens
                    title: OutputTokensDetails
                  total_tokens:
                    type: integer
                additionalProperties: false
                required:
                  - input_tokens
                  - input_tokens_details
                  - output_tokens
                  - output_tokens_details
                  - total_tokens
                title: BatchUsage
            additionalProperties: false
            required:
              - id
              - completion_window
              - created_at
              - endpoint
              - input_file_id
              - object
              - status
            title: Batch
        first_id:
          type: string
        last_id:
          type: string
        has_more:
          type: boolean
          default: false
      additionalProperties: false
      required:
        - object
        - data
        - has_more
      title: ListBatchesResponse
      description: >-
        Response containing a list of batch objects.
    CreateBatchRequest:
      type: object
      properties:
        input_file_id:
          type: string
          description: >-
            The ID of an uploaded file containing requests for the batch.
        endpoint:
          type: string
          description: >-
            The endpoint to be used for all requests in the batch.
        completion_window:
          type: string
          const: 24h
          description: >-
            The time window within which the batch should be processed.
        metadata:
          type: object
          additionalProperties:
            type: string
          description: Optional metadata for the batch.
        idempotency_key:
          type: string
          description: >-
            Optional idempotency key. When provided, enables idempotent behavior.
      additionalProperties: false
      required:
        - input_file_id
        - endpoint
        - completion_window
      title: CreateBatchRequest
    Batch:
      type: object
      properties:
        id:
          type: string
        completion_window:
          type: string
        created_at:
          type: integer
        endpoint:
          type: string
        input_file_id:
          type: string
        object:
          type: string
          const: batch
        status:
          type: string
          enum:
            - validating
            - failed
            - in_progress
            - finalizing
            - completed
            - expired
            - cancelling
            - cancelled
        cancelled_at:
          type: integer
        cancelling_at:
          type: integer
        completed_at:
          type: integer
        error_file_id:
          type: string
        errors:
          type: object
          properties:
            data:
              type: array
              items:
                type: object
                properties:
                  code:
                    type: string
                  line:
                    type: integer
                  message:
                    type: string
                  param:
                    type: string
                additionalProperties: false
                title: BatchError
            object:
              type: string
          additionalProperties: false
          title: Errors
        expired_at:
          type: integer
        expires_at:
          type: integer
        failed_at:
          type: integer
        finalizing_at:
          type: integer
        in_progress_at:
          type: integer
        metadata:
          type: object
          additionalProperties:
            type: string
        model:
          type: string
        output_file_id:
          type: string
        request_counts:
          type: object
          properties:
            completed:
              type: integer
            failed:
              type: integer
            total:
              type: integer
          additionalProperties: false
          required:
            - completed
            - failed
            - total
          title: BatchRequestCounts
        usage:
          type: object
          properties:
            input_tokens:
              type: integer
            input_tokens_details:
              type: object
              properties:
                cached_tokens:
                  type: integer
              additionalProperties: false
              required:
                - cached_tokens
              title: InputTokensDetails
            output_tokens:
              type: integer
            output_tokens_details:
              type: object
              properties:
                reasoning_tokens:
                  type: integer
              additionalProperties: false
              required:
                - reasoning_tokens
              title: OutputTokensDetails
            total_tokens:
              type: integer
          additionalProperties: false
          required:
            - input_tokens
            - input_tokens_details
            - output_tokens
            - output_tokens_details
            - total_tokens
          title: BatchUsage
      additionalProperties: false
      required:
        - id
        - completion_window
        - created_at
        - endpoint
        - input_file_id
        - object
        - status
      title: Batch
    Order:
      type: string
      enum:
@@ -10258,6 +10718,10 @@ components:
          description: >-
            The content of the chunk, which can be interleaved text, images, or other
            types.
        chunk_id:
          type: string
          description: >-
            Unique identifier for the chunk. Must be provided explicitly.
        metadata:
          type: object
          additionalProperties:

@@ -10278,10 +10742,6 @@ components:
          description: >-
            Optional embedding for the chunk. If not provided, it will be computed
            later.
        stored_chunk_id:
          type: string
          description: >-
            The chunk ID that is stored in the vector database. Used for backend functionality.
        chunk_metadata:
          $ref: '#/components/schemas/ChunkMetadata'
          description: >-

@@ -10290,6 +10750,7 @@ components:
      additionalProperties: false
      required:
        - content
        - chunk_id
        - metadata
      title: Chunk
      description: >-

@@ -13527,6 +13988,19 @@ tags:
    description: >-
      APIs for creating and interacting with agentic systems.
    x-displayName: Agents
  - name: Batches
    description: >-
      The API is designed to allow use of openai client libraries for seamless integration.


      This API provides the following extensions:
       - idempotent batch creation

      Note: This API is currently under active development and may undergo changes.
    x-displayName: >-
      The Batches API enables efficient processing of multiple requests in a single
      operation, particularly useful for processing large datasets, batch evaluation
      workflows, and cost-effective inference at scale.
  - name: Benchmarks
    description: ''
  - name: Conversations

@@ -13601,6 +14075,7 @@ x-tagGroups:
  - name: Operations
    tags:
      - Agents
      - Batches
      - Benchmarks
      - Conversations
      - DatasetIO
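Since the Batches tag description says the API is designed to work with OpenAI client libraries, plain HTTP also suffices; a minimal curl sketch against the spec's example server (the file and batch IDs are placeholders):

```bash
BASE=http://any-hosted-llama-stack.com

# Create a batch from an uploaded requests file; '24h' is the only
# completion_window the schema allows, and idempotency_key is the
# extension called out in the Batches tag description.
curl -s -X POST "$BASE/v1/batches" \
  -H 'Content-Type: application/json' \
  -d '{
        "input_file_id": "file-abc123",
        "endpoint": "/v1/chat/completions",
        "completion_window": "24h",
        "idempotency_key": "nightly-eval-2025-12-03"
      }'

# List batches with the cursor-style after/limit query parameters.
curl -s "$BASE/v1/batches?limit=20&after=batch_abc123"

# Retrieve or cancel a specific batch.
curl -s "$BASE/v1/batches/batch_abc123"
curl -s -X POST "$BASE/v1/batches/batch_abc123/cancel"
```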
docs/notebooks/llamastack_agents_getting_started_examples.ipynb (new file, 1036 lines)

File diff suppressed because it is too large.
@@ -242,15 +242,6 @@ const sidebars: SidebarsConfig = {
            'providers/eval/remote_nvidia'
          ],
        },
        {
          type: 'category',
          label: 'Telemetry',
          collapsed: true,
          items: [
            'providers/telemetry/index',
            'providers/telemetry/inline_meta-reference'
          ],
        },
        {
          type: 'category',
          label: 'Batches',
docs/static/deprecated-llama-stack-spec.html (vendored, 638 changes)
@@ -1414,6 +1414,193 @@
[Adds the same three Batches operations as the main spec above, in JSON form under the legacy /v1/openai/v1/batches, /v1/openai/v1/batches/{batch_id}, and /v1/openai/v1/batches/{batch_id}/cancel paths, each marked "deprecated": true.]
@@ -6401,6 +6588,451 @@
[Adds the ListBatchesResponse, CreateBatchRequest, and Batch component schemas in JSON form, verbatim duplicates of the schemas added to the main spec above, followed by the existing Order enum.]
@@ -13505,6 +14137,11 @@
      "description": "APIs for creating and interacting with agentic systems.\n\n## Deprecated APIs\n\n> **⚠️ DEPRECATED**: These APIs are provided for migration reference and will be removed in future versions. Not recommended for new projects.\n\n### Migration Guidance\n\nIf you are using deprecated versions of the Agents or Responses APIs, please migrate to:\n\n- **Responses API**: Use the stable v1 Responses API endpoints\n",
      "x-displayName": "Agents"
    },
    {
      "name": "Batches",
      "description": "The API is designed to allow use of openai client libraries for seamless integration.\n\nThis API provides the following extensions:\n - idempotent batch creation\n\nNote: This API is currently under active development and may undergo changes.",
      "x-displayName": "The Batches API enables efficient processing of multiple requests in a single operation, particularly useful for processing large datasets, batch evaluation workflows, and cost-effective inference at scale."
    },
    {
      "name": "Benchmarks",
      "description": ""
@@ -13555,6 +14192,7 @@
      "name": "Operations",
      "tags": [
        "Agents",
        "Batches",
        "Benchmarks",
        "DatasetIO",
        "Datasets",
docs/static/deprecated-llama-stack-spec.yaml (vendored, 474 changes)
@ -1012,6 +1012,141 @@ paths:
|
|||
schema:
|
||||
type: string
|
||||
deprecated: true
|
||||
/v1/openai/v1/batches:
|
||||
get:
|
||||
responses:
|
||||
'200':
|
||||
description: A list of batch objects.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ListBatchesResponse'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Batches
|
||||
summary: List all batches for the current user.
|
||||
description: List all batches for the current user.
|
||||
parameters:
|
||||
- name: after
|
||||
in: query
|
||||
description: >-
|
||||
A cursor for pagination; returns batches after this batch ID.
|
||||
required: false
|
||||
schema:
|
||||
type: string
|
||||
- name: limit
|
||||
in: query
|
||||
description: >-
|
||||
Number of batches to return (default 20, max 100).
|
||||
required: true
|
||||
schema:
|
||||
type: integer
|
||||
deprecated: true
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: The created batch object.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/Batch'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Batches
|
||||
summary: >-
|
||||
Create a new batch for processing multiple API requests.
|
||||
description: >-
|
||||
Create a new batch for processing multiple API requests.
|
||||
parameters: []
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/CreateBatchRequest'
|
||||
required: true
|
||||
deprecated: true
|
||||
/v1/openai/v1/batches/{batch_id}:
|
||||
get:
|
||||
responses:
|
||||
'200':
|
||||
description: The batch object.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/Batch'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Batches
|
||||
summary: >-
|
||||
Retrieve information about a specific batch.
|
||||
description: >-
|
||||
Retrieve information about a specific batch.
|
||||
parameters:
|
||||
- name: batch_id
|
||||
in: path
|
||||
description: The ID of the batch to retrieve.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
deprecated: true
|
||||
/v1/openai/v1/batches/{batch_id}/cancel:
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: The updated batch object.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/Batch'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Batches
|
||||
summary: Cancel a batch that is in progress.
|
||||
description: Cancel a batch that is in progress.
|
||||
parameters:
|
||||
- name: batch_id
|
||||
in: path
|
||||
description: The ID of the batch to cancel.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
deprecated: true
|
||||
/v1/openai/v1/chat/completions:
|
||||
get:
|
||||
responses:
|
||||
|
|
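A reader's note on the hunk above: the legacy `/v1/openai/v1/batches` routes remain callable while marked deprecated. A minimal sketch of driving them with the `openai` Python client — the base URL, port, and `api_key="none"` are assumptions for illustration, not part of this diff:

```python
from openai import OpenAI

# The legacy prefix is passed as the base URL; new code should target /v1 instead.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# GET /v1/openai/v1/batches — `limit` is the documented query parameter above.
page = client.batches.list(limit=20)
for batch in page.data:
    print(batch.id, batch.status)

# POST /v1/openai/v1/batches/{batch_id}/cancel for anything still in progress.
for batch in page.data:
    if batch.status == "in_progress":
        client.batches.cancel(batch.id)
```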
@ -4736,6 +4871,331 @@ components:
      title: Job
      description: >-
        A job execution instance with status tracking.
    ListBatchesResponse:
      type: object
      properties:
        object:
          type: string
          const: list
          default: list
        data:
          type: array
          items:
            type: object
            properties:
              id:
                type: string
              completion_window:
                type: string
              created_at:
                type: integer
              endpoint:
                type: string
              input_file_id:
                type: string
              object:
                type: string
                const: batch
              status:
                type: string
                enum:
                  - validating
                  - failed
                  - in_progress
                  - finalizing
                  - completed
                  - expired
                  - cancelling
                  - cancelled
              cancelled_at:
                type: integer
              cancelling_at:
                type: integer
              completed_at:
                type: integer
              error_file_id:
                type: string
              errors:
                type: object
                properties:
                  data:
                    type: array
                    items:
                      type: object
                      properties:
                        code:
                          type: string
                        line:
                          type: integer
                        message:
                          type: string
                        param:
                          type: string
                      additionalProperties: false
                      title: BatchError
                  object:
                    type: string
                additionalProperties: false
                title: Errors
              expired_at:
                type: integer
              expires_at:
                type: integer
              failed_at:
                type: integer
              finalizing_at:
                type: integer
              in_progress_at:
                type: integer
              metadata:
                type: object
                additionalProperties:
                  type: string
              model:
                type: string
              output_file_id:
                type: string
              request_counts:
                type: object
                properties:
                  completed:
                    type: integer
                  failed:
                    type: integer
                  total:
                    type: integer
                additionalProperties: false
                required:
                  - completed
                  - failed
                  - total
                title: BatchRequestCounts
              usage:
                type: object
                properties:
                  input_tokens:
                    type: integer
                  input_tokens_details:
                    type: object
                    properties:
                      cached_tokens:
                        type: integer
                    additionalProperties: false
                    required:
                      - cached_tokens
                    title: InputTokensDetails
                  output_tokens:
                    type: integer
                  output_tokens_details:
                    type: object
                    properties:
                      reasoning_tokens:
                        type: integer
                    additionalProperties: false
                    required:
                      - reasoning_tokens
                    title: OutputTokensDetails
                  total_tokens:
                    type: integer
                additionalProperties: false
                required:
                  - input_tokens
                  - input_tokens_details
                  - output_tokens
                  - output_tokens_details
                  - total_tokens
                title: BatchUsage
            additionalProperties: false
            required:
              - id
              - completion_window
              - created_at
              - endpoint
              - input_file_id
              - object
              - status
            title: Batch
        first_id:
          type: string
        last_id:
          type: string
        has_more:
          type: boolean
          default: false
      additionalProperties: false
      required:
        - object
        - data
        - has_more
      title: ListBatchesResponse
      description: >-
        Response containing a list of batch objects.
    CreateBatchRequest:
      type: object
      properties:
        input_file_id:
          type: string
          description: >-
            The ID of an uploaded file containing requests for the batch.
        endpoint:
          type: string
          description: >-
            The endpoint to be used for all requests in the batch.
        completion_window:
          type: string
          const: 24h
          description: >-
            The time window within which the batch should be processed.
        metadata:
          type: object
          additionalProperties:
            type: string
          description: Optional metadata for the batch.
        idempotency_key:
          type: string
          description: >-
            Optional idempotency key. When provided, enables idempotent behavior.
      additionalProperties: false
      required:
        - input_file_id
        - endpoint
        - completion_window
      title: CreateBatchRequest
    Batch:
      type: object
      properties:
        id:
          type: string
        completion_window:
          type: string
        created_at:
          type: integer
        endpoint:
          type: string
        input_file_id:
          type: string
        object:
          type: string
          const: batch
        status:
          type: string
          enum:
            - validating
            - failed
            - in_progress
            - finalizing
            - completed
            - expired
            - cancelling
            - cancelled
        cancelled_at:
          type: integer
        cancelling_at:
          type: integer
        completed_at:
          type: integer
        error_file_id:
          type: string
        errors:
          type: object
          properties:
            data:
              type: array
              items:
                type: object
                properties:
                  code:
                    type: string
                  line:
                    type: integer
                  message:
                    type: string
                  param:
                    type: string
                additionalProperties: false
                title: BatchError
            object:
              type: string
          additionalProperties: false
          title: Errors
        expired_at:
          type: integer
        expires_at:
          type: integer
        failed_at:
          type: integer
        finalizing_at:
          type: integer
        in_progress_at:
          type: integer
        metadata:
          type: object
          additionalProperties:
            type: string
        model:
          type: string
        output_file_id:
          type: string
        request_counts:
          type: object
          properties:
            completed:
              type: integer
            failed:
              type: integer
            total:
              type: integer
          additionalProperties: false
          required:
            - completed
            - failed
            - total
          title: BatchRequestCounts
        usage:
          type: object
          properties:
            input_tokens:
              type: integer
            input_tokens_details:
              type: object
              properties:
                cached_tokens:
                  type: integer
              additionalProperties: false
              required:
                - cached_tokens
              title: InputTokensDetails
            output_tokens:
              type: integer
            output_tokens_details:
              type: object
              properties:
                reasoning_tokens:
                  type: integer
              additionalProperties: false
              required:
                - reasoning_tokens
              title: OutputTokensDetails
            total_tokens:
              type: integer
          additionalProperties: false
          required:
            - input_tokens
            - input_tokens_details
            - output_tokens
            - output_tokens_details
            - total_tokens
          title: BatchUsage
      additionalProperties: false
      required:
        - id
        - completion_window
        - created_at
        - endpoint
        - input_file_id
        - object
        - status
      title: Batch
    Order:
      type: string
      enum:

@ -10263,6 +10723,19 @@ tags:
      - **Responses API**: Use the stable v1 Responses API endpoints
    x-displayName: Agents
  - name: Batches
    description: >-
      The API is designed to allow use of openai client libraries for seamless integration.


      This API provides the following extensions:
       - idempotent batch creation

      Note: This API is currently under active development and may undergo changes.
    x-displayName: >-
      The Batches API enables efficient processing of multiple requests in a single
      operation, particularly useful for processing large datasets, batch evaluation
      workflows, and cost-effective inference at scale.
  - name: Benchmarks
    description: ''
  - name: DatasetIO

@ -10308,6 +10781,7 @@ x-tagGroups:
  - name: Operations
    tags:
      - Agents
      - Batches
      - Benchmarks
      - DatasetIO
      - Datasets
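The Batches tag description above says the API "is designed to allow use of openai client libraries". A sketch of the full flow implied by `CreateBatchRequest` — build a JSONL input file, upload it, create the batch. The server URL, model ID, and file contents are placeholders; the per-line request shape follows the OpenAI batch-file convention, which this openai-compatible API presumably mirrors:

```python
import json
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

# One request per JSONL line; each line's "url" should match the batch's `endpoint`.
with open("requests.jsonl", "w") as f:
    for i, question in enumerate(["What is a llama?", "What is a batch API?"]):
        f.write(json.dumps({
            "custom_id": f"req-{i}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "llama3.2:3b",  # placeholder model ID
                "messages": [{"role": "user", "content": question}],
            },
        }) + "\n")

# Upload the file, then create the batch with the three required fields from
# CreateBatchRequest; `completion_window` is pinned to the constant "24h".
batch_input = client.files.create(file=open("requests.jsonl", "rb"), purpose="batch")
batch = client.batches.create(
    input_file_id=batch_input.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"suite": "smoke"},  # optional string-to-string map
)
print(batch.id, batch.status)  # a new batch starts in "validating"
```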
647
docs/static/llama-stack-spec.html
vendored
647
docs/static/llama-stack-spec.html
vendored

@ -40,6 +40,193 @@
      }
    ],
    "paths": {
      "/v1/batches": {
        "get": {
          "responses": {
            "200": {
              "description": "A list of batch objects.",
              "content": {
                "application/json": {
                  "schema": {
                    "$ref": "#/components/schemas/ListBatchesResponse"
                  }
                }
              }
            },
            "400": {
              "$ref": "#/components/responses/BadRequest400"
            },
            "429": {
              "$ref": "#/components/responses/TooManyRequests429"
            },
            "500": {
              "$ref": "#/components/responses/InternalServerError500"
            },
            "default": {
              "$ref": "#/components/responses/DefaultError"
            }
          },
          "tags": [
            "Batches"
          ],
          "summary": "List all batches for the current user.",
          "description": "List all batches for the current user.",
          "parameters": [
            {
              "name": "after",
              "in": "query",
              "description": "A cursor for pagination; returns batches after this batch ID.",
              "required": false,
              "schema": {
                "type": "string"
              }
            },
            {
              "name": "limit",
              "in": "query",
              "description": "Number of batches to return (default 20, max 100).",
              "required": true,
              "schema": {
                "type": "integer"
              }
            }
          ],
          "deprecated": false
        },
        "post": {
          "responses": {
            "200": {
              "description": "The created batch object.",
              "content": {
                "application/json": {
                  "schema": {
                    "$ref": "#/components/schemas/Batch"
                  }
                }
              }
            },
            "400": {
              "$ref": "#/components/responses/BadRequest400"
            },
            "429": {
              "$ref": "#/components/responses/TooManyRequests429"
            },
            "500": {
              "$ref": "#/components/responses/InternalServerError500"
            },
            "default": {
              "$ref": "#/components/responses/DefaultError"
            }
          },
          "tags": [
            "Batches"
          ],
          "summary": "Create a new batch for processing multiple API requests.",
          "description": "Create a new batch for processing multiple API requests.",
          "parameters": [],
          "requestBody": {
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/CreateBatchRequest"
                }
              }
            },
            "required": true
          },
          "deprecated": false
        }
      },
      "/v1/batches/{batch_id}": {
        "get": {
          "responses": {
            "200": {
              "description": "The batch object.",
              "content": {
                "application/json": {
                  "schema": {
                    "$ref": "#/components/schemas/Batch"
                  }
                }
              }
            },
            "400": {
              "$ref": "#/components/responses/BadRequest400"
            },
            "429": {
              "$ref": "#/components/responses/TooManyRequests429"
            },
            "500": {
              "$ref": "#/components/responses/InternalServerError500"
            },
            "default": {
              "$ref": "#/components/responses/DefaultError"
            }
          },
          "tags": [
            "Batches"
          ],
          "summary": "Retrieve information about a specific batch.",
          "description": "Retrieve information about a specific batch.",
          "parameters": [
            {
              "name": "batch_id",
              "in": "path",
              "description": "The ID of the batch to retrieve.",
              "required": true,
              "schema": {
                "type": "string"
              }
            }
          ],
          "deprecated": false
        }
      },
      "/v1/batches/{batch_id}/cancel": {
        "post": {
          "responses": {
            "200": {
              "description": "The updated batch object.",
              "content": {
                "application/json": {
                  "schema": {
                    "$ref": "#/components/schemas/Batch"
                  }
                }
              }
            },
            "400": {
              "$ref": "#/components/responses/BadRequest400"
            },
            "429": {
              "$ref": "#/components/responses/TooManyRequests429"
            },
            "500": {
              "$ref": "#/components/responses/InternalServerError500"
            },
            "default": {
              "$ref": "#/components/responses/DefaultError"
            }
          },
          "tags": [
            "Batches"
          ],
          "summary": "Cancel a batch that is in progress.",
          "description": "Cancel a batch that is in progress.",
          "parameters": [
            {
              "name": "batch_id",
              "in": "path",
              "description": "The ID of the batch to cancel.",
              "required": true,
              "schema": {
                "type": "string"
              }
            }
          ],
          "deprecated": false
        }
      },
      "/v1/chat/completions": {
        "get": {
          "responses": {
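The `status` enum in the schemas below (validating, in_progress, finalizing, then completed, failed, expired, or cancelled) suggests a simple polling loop against GET /v1/batches/{batch_id}. A sketch, with the server URL, key handling, and poll interval as illustrative assumptions:

```python
import time
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

TERMINAL_STATES = {"completed", "failed", "expired", "cancelled"}

def wait_for_batch(batch_id: str, interval_s: float = 10.0):
    """Poll GET /v1/batches/{batch_id} until the status enum hits a terminal value."""
    while True:
        batch = client.batches.retrieve(batch_id)
        if batch.status in TERMINAL_STATES:
            return batch
        time.sleep(interval_s)
```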
@ -4005,6 +4192,451 @@
        "title": "Error",
        "description": "Error response from the API. Roughly follows RFC 7807."
      },
      "ListBatchesResponse": {
        "type": "object",
        "properties": {
          "object": {
            "type": "string",
            "const": "list",
            "default": "list"
          },
          "data": {
            "type": "array",
            "items": {
              "type": "object",
              "properties": {
                "id": {
                  "type": "string"
                },
                "completion_window": {
                  "type": "string"
                },
                "created_at": {
                  "type": "integer"
                },
                "endpoint": {
                  "type": "string"
                },
                "input_file_id": {
                  "type": "string"
                },
                "object": {
                  "type": "string",
                  "const": "batch"
                },
                "status": {
                  "type": "string",
                  "enum": [
                    "validating",
                    "failed",
                    "in_progress",
                    "finalizing",
                    "completed",
                    "expired",
                    "cancelling",
                    "cancelled"
                  ]
                },
                "cancelled_at": {
                  "type": "integer"
                },
                "cancelling_at": {
                  "type": "integer"
                },
                "completed_at": {
                  "type": "integer"
                },
                "error_file_id": {
                  "type": "string"
                },
                "errors": {
                  "type": "object",
                  "properties": {
                    "data": {
                      "type": "array",
                      "items": {
                        "type": "object",
                        "properties": {
                          "code": {
                            "type": "string"
                          },
                          "line": {
                            "type": "integer"
                          },
                          "message": {
                            "type": "string"
                          },
                          "param": {
                            "type": "string"
                          }
                        },
                        "additionalProperties": false,
                        "title": "BatchError"
                      }
                    },
                    "object": {
                      "type": "string"
                    }
                  },
                  "additionalProperties": false,
                  "title": "Errors"
                },
                "expired_at": {
                  "type": "integer"
                },
                "expires_at": {
                  "type": "integer"
                },
                "failed_at": {
                  "type": "integer"
                },
                "finalizing_at": {
                  "type": "integer"
                },
                "in_progress_at": {
                  "type": "integer"
                },
                "metadata": {
                  "type": "object",
                  "additionalProperties": {
                    "type": "string"
                  }
                },
                "model": {
                  "type": "string"
                },
                "output_file_id": {
                  "type": "string"
                },
                "request_counts": {
                  "type": "object",
                  "properties": {
                    "completed": {
                      "type": "integer"
                    },
                    "failed": {
                      "type": "integer"
                    },
                    "total": {
                      "type": "integer"
                    }
                  },
                  "additionalProperties": false,
                  "required": [
                    "completed",
                    "failed",
                    "total"
                  ],
                  "title": "BatchRequestCounts"
                },
                "usage": {
                  "type": "object",
                  "properties": {
                    "input_tokens": {
                      "type": "integer"
                    },
                    "input_tokens_details": {
                      "type": "object",
                      "properties": {
                        "cached_tokens": {
                          "type": "integer"
                        }
                      },
                      "additionalProperties": false,
                      "required": [
                        "cached_tokens"
                      ],
                      "title": "InputTokensDetails"
                    },
                    "output_tokens": {
                      "type": "integer"
                    },
                    "output_tokens_details": {
                      "type": "object",
                      "properties": {
                        "reasoning_tokens": {
                          "type": "integer"
                        }
                      },
                      "additionalProperties": false,
                      "required": [
                        "reasoning_tokens"
                      ],
                      "title": "OutputTokensDetails"
                    },
                    "total_tokens": {
                      "type": "integer"
                    }
                  },
                  "additionalProperties": false,
                  "required": [
                    "input_tokens",
                    "input_tokens_details",
                    "output_tokens",
                    "output_tokens_details",
                    "total_tokens"
                  ],
                  "title": "BatchUsage"
                }
              },
              "additionalProperties": false,
              "required": [
                "id",
                "completion_window",
                "created_at",
                "endpoint",
                "input_file_id",
                "object",
                "status"
              ],
              "title": "Batch"
            }
          },
          "first_id": {
            "type": "string"
          },
          "last_id": {
            "type": "string"
          },
          "has_more": {
            "type": "boolean",
            "default": false
          }
        },
        "additionalProperties": false,
        "required": [
          "object",
          "data",
          "has_more"
        ],
        "title": "ListBatchesResponse",
        "description": "Response containing a list of batch objects."
      },
      "CreateBatchRequest": {
        "type": "object",
        "properties": {
          "input_file_id": {
            "type": "string",
            "description": "The ID of an uploaded file containing requests for the batch."
          },
          "endpoint": {
            "type": "string",
            "description": "The endpoint to be used for all requests in the batch."
          },
          "completion_window": {
            "type": "string",
            "const": "24h",
            "description": "The time window within which the batch should be processed."
          },
          "metadata": {
            "type": "object",
            "additionalProperties": {
              "type": "string"
            },
            "description": "Optional metadata for the batch."
          },
          "idempotency_key": {
            "type": "string",
            "description": "Optional idempotency key. When provided, enables idempotent behavior."
          }
        },
        "additionalProperties": false,
        "required": [
          "input_file_id",
          "endpoint",
          "completion_window"
        ],
        "title": "CreateBatchRequest"
      },
      "Batch": {
        "type": "object",
        "properties": {
          "id": {
            "type": "string"
          },
          "completion_window": {
            "type": "string"
          },
          "created_at": {
            "type": "integer"
          },
          "endpoint": {
            "type": "string"
          },
          "input_file_id": {
            "type": "string"
          },
          "object": {
            "type": "string",
            "const": "batch"
          },
          "status": {
            "type": "string",
            "enum": [
              "validating",
              "failed",
              "in_progress",
              "finalizing",
              "completed",
              "expired",
              "cancelling",
              "cancelled"
            ]
          },
          "cancelled_at": {
            "type": "integer"
          },
          "cancelling_at": {
            "type": "integer"
          },
          "completed_at": {
            "type": "integer"
          },
          "error_file_id": {
            "type": "string"
          },
          "errors": {
            "type": "object",
            "properties": {
              "data": {
                "type": "array",
                "items": {
                  "type": "object",
                  "properties": {
                    "code": {
                      "type": "string"
                    },
                    "line": {
                      "type": "integer"
                    },
                    "message": {
                      "type": "string"
                    },
                    "param": {
                      "type": "string"
                    }
                  },
                  "additionalProperties": false,
                  "title": "BatchError"
                }
              },
              "object": {
                "type": "string"
              }
            },
            "additionalProperties": false,
            "title": "Errors"
          },
          "expired_at": {
            "type": "integer"
          },
          "expires_at": {
            "type": "integer"
          },
          "failed_at": {
            "type": "integer"
          },
          "finalizing_at": {
            "type": "integer"
          },
          "in_progress_at": {
            "type": "integer"
          },
          "metadata": {
            "type": "object",
            "additionalProperties": {
              "type": "string"
            }
          },
          "model": {
            "type": "string"
          },
          "output_file_id": {
            "type": "string"
          },
          "request_counts": {
            "type": "object",
            "properties": {
              "completed": {
                "type": "integer"
              },
              "failed": {
                "type": "integer"
              },
              "total": {
                "type": "integer"
              }
            },
            "additionalProperties": false,
            "required": [
              "completed",
              "failed",
              "total"
            ],
            "title": "BatchRequestCounts"
          },
          "usage": {
            "type": "object",
            "properties": {
              "input_tokens": {
                "type": "integer"
              },
              "input_tokens_details": {
                "type": "object",
                "properties": {
                  "cached_tokens": {
                    "type": "integer"
                  }
                },
                "additionalProperties": false,
                "required": [
                  "cached_tokens"
                ],
                "title": "InputTokensDetails"
              },
              "output_tokens": {
                "type": "integer"
              },
              "output_tokens_details": {
                "type": "object",
                "properties": {
                  "reasoning_tokens": {
                    "type": "integer"
                  }
                },
                "additionalProperties": false,
                "required": [
                  "reasoning_tokens"
                ],
                "title": "OutputTokensDetails"
              },
              "total_tokens": {
                "type": "integer"
              }
            },
            "additionalProperties": false,
            "required": [
              "input_tokens",
              "input_tokens_details",
              "output_tokens",
              "output_tokens_details",
              "total_tokens"
            ],
            "title": "BatchUsage"
          }
        },
        "additionalProperties": false,
        "required": [
          "id",
          "completion_window",
          "created_at",
          "endpoint",
          "input_file_id",
          "object",
          "status"
        ],
        "title": "Batch"
      },
      "Order": {
        "type": "string",
        "enum": [

@ -11897,6 +12529,10 @@
            "$ref": "#/components/schemas/InterleavedContent",
            "description": "The content of the chunk, which can be interleaved text, images, or other types."
          },
          "chunk_id": {
            "type": "string",
            "description": "Unique identifier for the chunk. Must be provided explicitly."
          },
          "metadata": {
            "type": "object",
            "additionalProperties": {

@ -11930,10 +12566,6 @@
          },
          "description": "Optional embedding for the chunk. If not provided, it will be computed later."
        },
        "stored_chunk_id": {
          "type": "string",
          "description": "The chunk ID that is stored in the vector database. Used for backend functionality."
        },
        "chunk_metadata": {
          "$ref": "#/components/schemas/ChunkMetadata",
          "description": "Metadata for the chunk that will NOT be used in the context during inference. The `chunk_metadata` is required backend functionality."

@ -11942,6 +12574,7 @@
        "additionalProperties": false,
        "required": [
          "content",
          "chunk_id",
          "metadata"
        ],
        "title": "Chunk",

@ -13288,6 +13921,11 @@
      "description": "APIs for creating and interacting with agentic systems.\n\n## Responses API\n\nThe Responses API provides OpenAI-compatible functionality with enhanced capabilities for dynamic, stateful interactions.\n\n> **✅ STABLE**: This API is production-ready with backward compatibility guarantees. Recommended for production applications.\n\n### ✅ Supported Tools\n\nThe Responses API supports the following tool types:\n\n- **`web_search`**: Search the web for current information and real-time data\n- **`file_search`**: Search through uploaded files and vector stores\n - Supports dynamic `vector_store_ids` per call\n - Compatible with OpenAI file search patterns\n- **`function`**: Call custom functions with JSON schema validation\n- **`mcp_tool`**: Model Context Protocol integration\n\n### ✅ Supported Fields & Features\n\n**Core Capabilities:**\n- **Dynamic Configuration**: Switch models, vector stores, and tools per request without pre-configuration\n- **Conversation Branching**: Use `previous_response_id` to branch conversations and explore different paths\n- **Rich Annotations**: Automatic file citations, URL citations, and container file citations\n- **Status Tracking**: Monitor tool call execution status and handle failures gracefully\n\n### 🚧 Work in Progress\n\n- Full real-time response streaming support\n- `tool_choice` parameter\n- `max_tool_calls` parameter\n- Built-in tools (code interpreter, containers API)\n- Safety & guardrails\n- `reasoning` capabilities\n- `service_tier`\n- `logprobs`\n- `max_output_tokens`\n- `metadata` handling\n- `instructions`\n- `incomplete_details`\n- `background`",
      "x-displayName": "Agents"
    },
    {
      "name": "Batches",
      "description": "The API is designed to allow use of openai client libraries for seamless integration.\n\nThis API provides the following extensions:\n - idempotent batch creation\n\nNote: This API is currently under active development and may undergo changes.",
      "x-displayName": "The Batches API enables efficient processing of multiple requests in a single operation, particularly useful for processing large datasets, batch evaluation workflows, and cost-effective inference at scale."
    },
    {
      "name": "Conversations",
      "description": "Protocol for conversation management operations.",

@ -13361,6 +13999,7 @@
      "name": "Operations",
      "tags": [
        "Agents",
        "Batches",
        "Conversations",
        "Files",
        "Inference",
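Beyond Batches, this file's diff also tightens the Chunk schema: `chunk_id` is now required and caller-supplied, and the separate `stored_chunk_id` field is gone. A sketch of the payload shape the updated schema implies — field names come from the schema, the values and the use of `uuid4` for ID generation are illustrative assumptions:

```python
import uuid

# Shape implied by the updated Chunk schema above.
chunk = {
    "content": "Llamas are camelids native to South America.",
    "chunk_id": str(uuid.uuid4()),        # must now be provided explicitly
    "metadata": {"document_id": "doc-1"},  # used in context during inference
}
```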
483
docs/static/llama-stack-spec.yaml
vendored
483
docs/static/llama-stack-spec.yaml
vendored

@ -12,6 +12,141 @@ info:
servers:
  - url: http://any-hosted-llama-stack.com
paths:
  /v1/batches:
    get:
      responses:
        '200':
          description: A list of batch objects.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ListBatchesResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Batches
      summary: List all batches for the current user.
      description: List all batches for the current user.
      parameters:
        - name: after
          in: query
          description: >-
            A cursor for pagination; returns batches after this batch ID.
          required: false
          schema:
            type: string
        - name: limit
          in: query
          description: >-
            Number of batches to return (default 20, max 100).
          required: true
          schema:
            type: integer
      deprecated: false
    post:
      responses:
        '200':
          description: The created batch object.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Batch'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Batches
      summary: >-
        Create a new batch for processing multiple API requests.
      description: >-
        Create a new batch for processing multiple API requests.
      parameters: []
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateBatchRequest'
        required: true
      deprecated: false
  /v1/batches/{batch_id}:
    get:
      responses:
        '200':
          description: The batch object.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Batch'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Batches
      summary: >-
        Retrieve information about a specific batch.
      description: >-
        Retrieve information about a specific batch.
      parameters:
        - name: batch_id
          in: path
          description: The ID of the batch to retrieve.
          required: true
          schema:
            type: string
      deprecated: false
  /v1/batches/{batch_id}/cancel:
    post:
      responses:
        '200':
          description: The updated batch object.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Batch'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Batches
      summary: Cancel a batch that is in progress.
      description: Cancel a batch that is in progress.
      parameters:
        - name: batch_id
          in: path
          description: The ID of the batch to cancel.
          required: true
          schema:
            type: string
      deprecated: false
  /v1/chat/completions:
    get:
      responses:

@ -2999,6 +3134,331 @@ components:
      title: Error
      description: >-
        Error response from the API. Roughly follows RFC 7807.
    ListBatchesResponse:
      type: object
      properties:
        object:
          type: string
          const: list
          default: list
        data:
          type: array
          items:
            type: object
            properties:
              id:
                type: string
              completion_window:
                type: string
              created_at:
                type: integer
              endpoint:
                type: string
              input_file_id:
                type: string
              object:
                type: string
                const: batch
              status:
                type: string
                enum:
                  - validating
                  - failed
                  - in_progress
                  - finalizing
                  - completed
                  - expired
                  - cancelling
                  - cancelled
              cancelled_at:
                type: integer
              cancelling_at:
                type: integer
              completed_at:
                type: integer
              error_file_id:
                type: string
              errors:
                type: object
                properties:
                  data:
                    type: array
                    items:
                      type: object
                      properties:
                        code:
                          type: string
                        line:
                          type: integer
                        message:
                          type: string
                        param:
                          type: string
                      additionalProperties: false
                      title: BatchError
                  object:
                    type: string
                additionalProperties: false
                title: Errors
              expired_at:
                type: integer
              expires_at:
                type: integer
              failed_at:
                type: integer
              finalizing_at:
                type: integer
              in_progress_at:
                type: integer
              metadata:
                type: object
                additionalProperties:
                  type: string
              model:
                type: string
              output_file_id:
                type: string
              request_counts:
                type: object
                properties:
                  completed:
                    type: integer
                  failed:
                    type: integer
                  total:
                    type: integer
                additionalProperties: false
                required:
                  - completed
                  - failed
                  - total
                title: BatchRequestCounts
              usage:
                type: object
                properties:
                  input_tokens:
                    type: integer
                  input_tokens_details:
                    type: object
                    properties:
                      cached_tokens:
                        type: integer
                    additionalProperties: false
                    required:
                      - cached_tokens
                    title: InputTokensDetails
                  output_tokens:
                    type: integer
                  output_tokens_details:
                    type: object
                    properties:
                      reasoning_tokens:
                        type: integer
                    additionalProperties: false
                    required:
                      - reasoning_tokens
                    title: OutputTokensDetails
                  total_tokens:
                    type: integer
                additionalProperties: false
                required:
                  - input_tokens
                  - input_tokens_details
                  - output_tokens
                  - output_tokens_details
                  - total_tokens
                title: BatchUsage
            additionalProperties: false
            required:
              - id
              - completion_window
              - created_at
              - endpoint
              - input_file_id
              - object
              - status
            title: Batch
        first_id:
          type: string
        last_id:
          type: string
        has_more:
          type: boolean
          default: false
      additionalProperties: false
      required:
        - object
        - data
        - has_more
      title: ListBatchesResponse
      description: >-
        Response containing a list of batch objects.
    CreateBatchRequest:
      type: object
      properties:
        input_file_id:
          type: string
          description: >-
            The ID of an uploaded file containing requests for the batch.
        endpoint:
          type: string
          description: >-
            The endpoint to be used for all requests in the batch.
        completion_window:
          type: string
          const: 24h
          description: >-
            The time window within which the batch should be processed.
        metadata:
          type: object
          additionalProperties:
            type: string
          description: Optional metadata for the batch.
        idempotency_key:
          type: string
          description: >-
            Optional idempotency key. When provided, enables idempotent behavior.
      additionalProperties: false
      required:
        - input_file_id
        - endpoint
        - completion_window
      title: CreateBatchRequest
    Batch:
      type: object
      properties:
        id:
          type: string
        completion_window:
          type: string
        created_at:
          type: integer
        endpoint:
          type: string
        input_file_id:
          type: string
        object:
          type: string
          const: batch
        status:
          type: string
          enum:
            - validating
            - failed
            - in_progress
            - finalizing
            - completed
            - expired
            - cancelling
            - cancelled
        cancelled_at:
          type: integer
        cancelling_at:
          type: integer
        completed_at:
          type: integer
        error_file_id:
          type: string
        errors:
          type: object
          properties:
            data:
              type: array
              items:
                type: object
                properties:
                  code:
                    type: string
                  line:
                    type: integer
                  message:
                    type: string
                  param:
                    type: string
                additionalProperties: false
                title: BatchError
            object:
              type: string
          additionalProperties: false
          title: Errors
        expired_at:
          type: integer
        expires_at:
          type: integer
        failed_at:
          type: integer
        finalizing_at:
          type: integer
        in_progress_at:
          type: integer
        metadata:
          type: object
          additionalProperties:
            type: string
        model:
          type: string
        output_file_id:
          type: string
        request_counts:
          type: object
          properties:
            completed:
              type: integer
            failed:
              type: integer
            total:
              type: integer
          additionalProperties: false
          required:
            - completed
            - failed
            - total
          title: BatchRequestCounts
        usage:
          type: object
          properties:
            input_tokens:
              type: integer
            input_tokens_details:
              type: object
              properties:
                cached_tokens:
                  type: integer
              additionalProperties: false
              required:
                - cached_tokens
              title: InputTokensDetails
            output_tokens:
              type: integer
            output_tokens_details:
              type: object
              properties:
                reasoning_tokens:
                  type: integer
              additionalProperties: false
              required:
                - reasoning_tokens
              title: OutputTokensDetails
            total_tokens:
              type: integer
          additionalProperties: false
          required:
            - input_tokens
            - input_tokens_details
            - output_tokens
            - output_tokens_details
            - total_tokens
          title: BatchUsage
      additionalProperties: false
      required:
        - id
        - completion_window
        - created_at
        - endpoint
        - input_file_id
        - object
        - status
      title: Batch
    Order:
      type: string
      enum:

@ -9045,6 +9505,10 @@ components:
          description: >-
            The content of the chunk, which can be interleaved text, images, or other
            types.
        chunk_id:
          type: string
          description: >-
            Unique identifier for the chunk. Must be provided explicitly.
        metadata:
          type: object
          additionalProperties:

@ -9065,10 +9529,6 @@ components:
          description: >-
            Optional embedding for the chunk. If not provided, it will be computed
            later.
        stored_chunk_id:
          type: string
          description: >-
            The chunk ID that is stored in the vector database. Used for backend functionality.
        chunk_metadata:
          $ref: '#/components/schemas/ChunkMetadata'
          description: >-

@ -9077,6 +9537,7 @@ components:
      additionalProperties: false
      required:
        - content
        - chunk_id
        - metadata
      title: Chunk
      description: >-

@ -10143,6 +10604,19 @@ tags:
      - `background`
    x-displayName: Agents
  - name: Batches
    description: >-
      The API is designed to allow use of openai client libraries for seamless integration.


      This API provides the following extensions:
       - idempotent batch creation

      Note: This API is currently under active development and may undergo changes.
    x-displayName: >-
      The Batches API enables efficient processing of multiple requests in a single
      operation, particularly useful for processing large datasets, batch evaluation
      workflows, and cost-effective inference at scale.
  - name: Conversations
    description: >-
      Protocol for conversation management operations.

@ -10205,6 +10679,7 @@ x-tagGroups:
  - name: Operations
    tags:
      - Agents
      - Batches
      - Conversations
      - Files
      - Inference
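The `idempotency_key` field on CreateBatchRequest is the extension the Batches tag description advertises as "idempotent batch creation". The upstream openai-python client does not expose it in `batches.create`'s signature, so one way to pass it is the client's standard `extra_body` escape hatch — a sketch, with the file ID, key value, and server URL as placeholders:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

# Retrying this call with the same key should return the same batch instead of
# creating a duplicate, per the `idempotency_key` description above.
batch = client.batches.create(
    input_file_id="file-abc123",  # placeholder file ID
    endpoint="/v1/chat/completions",
    completion_window="24h",
    extra_body={"idempotency_key": "nightly-eval-2025-11-30"},
)
```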
647
docs/static/stainless-llama-stack-spec.html
vendored
647
docs/static/stainless-llama-stack-spec.html
vendored

@ -40,6 +40,193 @@
      }
    ],
    "paths": {
      "/v1/batches": {
        "get": {
          "responses": {
            "200": {
              "description": "A list of batch objects.",
              "content": {
                "application/json": {
                  "schema": {
                    "$ref": "#/components/schemas/ListBatchesResponse"
                  }
                }
              }
            },
            "400": {
              "$ref": "#/components/responses/BadRequest400"
            },
            "429": {
              "$ref": "#/components/responses/TooManyRequests429"
            },
            "500": {
              "$ref": "#/components/responses/InternalServerError500"
            },
            "default": {
              "$ref": "#/components/responses/DefaultError"
            }
          },
          "tags": [
            "Batches"
          ],
          "summary": "List all batches for the current user.",
          "description": "List all batches for the current user.",
          "parameters": [
            {
              "name": "after",
              "in": "query",
              "description": "A cursor for pagination; returns batches after this batch ID.",
              "required": false,
              "schema": {
                "type": "string"
              }
            },
            {
              "name": "limit",
              "in": "query",
              "description": "Number of batches to return (default 20, max 100).",
              "required": true,
              "schema": {
                "type": "integer"
              }
            }
          ],
          "deprecated": false
        },
        "post": {
          "responses": {
            "200": {
              "description": "The created batch object.",
              "content": {
                "application/json": {
                  "schema": {
                    "$ref": "#/components/schemas/Batch"
                  }
                }
              }
            },
            "400": {
              "$ref": "#/components/responses/BadRequest400"
            },
            "429": {
              "$ref": "#/components/responses/TooManyRequests429"
            },
            "500": {
              "$ref": "#/components/responses/InternalServerError500"
            },
            "default": {
              "$ref": "#/components/responses/DefaultError"
            }
          },
          "tags": [
            "Batches"
          ],
          "summary": "Create a new batch for processing multiple API requests.",
          "description": "Create a new batch for processing multiple API requests.",
          "parameters": [],
          "requestBody": {
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/CreateBatchRequest"
                }
              }
            },
            "required": true
          },
          "deprecated": false
        }
      },
      "/v1/batches/{batch_id}": {
        "get": {
          "responses": {
            "200": {
              "description": "The batch object.",
              "content": {
                "application/json": {
                  "schema": {
                    "$ref": "#/components/schemas/Batch"
                  }
                }
              }
            },
            "400": {
              "$ref": "#/components/responses/BadRequest400"
            },
            "429": {
              "$ref": "#/components/responses/TooManyRequests429"
            },
            "500": {
              "$ref": "#/components/responses/InternalServerError500"
            },
            "default": {
              "$ref": "#/components/responses/DefaultError"
            }
          },
          "tags": [
            "Batches"
          ],
          "summary": "Retrieve information about a specific batch.",
          "description": "Retrieve information about a specific batch.",
          "parameters": [
            {
              "name": "batch_id",
              "in": "path",
              "description": "The ID of the batch to retrieve.",
              "required": true,
              "schema": {
                "type": "string"
              }
            }
          ],
          "deprecated": false
        }
      },
      "/v1/batches/{batch_id}/cancel": {
        "post": {
          "responses": {
            "200": {
              "description": "The updated batch object.",
              "content": {
                "application/json": {
                  "schema": {
                    "$ref": "#/components/schemas/Batch"
                  }
                }
              }
            },
            "400": {
              "$ref": "#/components/responses/BadRequest400"
            },
            "429": {
              "$ref": "#/components/responses/TooManyRequests429"
            },
            "500": {
              "$ref": "#/components/responses/InternalServerError500"
            },
            "default": {
              "$ref": "#/components/responses/DefaultError"
            }
          },
          "tags": [
            "Batches"
          ],
          "summary": "Cancel a batch that is in progress.",
          "description": "Cancel a batch that is in progress.",
          "parameters": [
            {
              "name": "batch_id",
              "in": "path",
              "description": "The ID of the batch to cancel.",
              "required": true,
              "schema": {
                "type": "string"
              }
            }
          ],
          "deprecated": false
        }
      },
      "/v1/chat/completions": {
        "get": {
          "responses": {

@ -5677,6 +5864,451 @@
        "title": "Error",
        "description": "Error response from the API. Roughly follows RFC 7807."
      },
      "ListBatchesResponse": {
        "type": "object",
        "properties": {
          "object": {
            "type": "string",
            "const": "list",
            "default": "list"
          },
          "data": {
            "type": "array",
            "items": {
              "type": "object",
              "properties": {
                "id": {
                  "type": "string"
                },
                "completion_window": {
                  "type": "string"
                },
                "created_at": {
                  "type": "integer"
                },
                "endpoint": {
                  "type": "string"
                },
                "input_file_id": {
                  "type": "string"
                },
                "object": {
                  "type": "string",
                  "const": "batch"
                },
                "status": {
                  "type": "string",
                  "enum": [
                    "validating",
                    "failed",
                    "in_progress",
                    "finalizing",
                    "completed",
                    "expired",
                    "cancelling",
                    "cancelled"
                  ]
                },
                "cancelled_at": {
                  "type": "integer"
                },
                "cancelling_at": {
                  "type": "integer"
                },
                "completed_at": {
                  "type": "integer"
                },
                "error_file_id": {
                  "type": "string"
                },
                "errors": {
                  "type": "object",
                  "properties": {
                    "data": {
                      "type": "array",
                      "items": {
                        "type": "object",
                        "properties": {
                          "code": {
                            "type": "string"
                          },
                          "line": {
                            "type": "integer"
                          },
                          "message": {
                            "type": "string"
                          },
                          "param": {
                            "type": "string"
                          }
                        },
                        "additionalProperties": false,
                        "title": "BatchError"
                      }
                    },
                    "object": {
                      "type": "string"
                    }
                  },
                  "additionalProperties": false,
                  "title": "Errors"
                },
                "expired_at": {
                  "type": "integer"
                },
                "expires_at": {
                  "type": "integer"
                },
                "failed_at": {
                  "type": "integer"
                },
                "finalizing_at": {
                  "type": "integer"
                },
                "in_progress_at": {
                  "type": "integer"
                },
                "metadata": {
                  "type": "object",
                  "additionalProperties": {
                    "type": "string"
                  }
                },
                "model": {
                  "type": "string"
                },
                "output_file_id": {
                  "type": "string"
                },
                "request_counts": {
                  "type": "object",
                  "properties": {
                    "completed": {
                      "type": "integer"
                    },
                    "failed": {
                      "type": "integer"
                    },
                    "total": {
                      "type": "integer"
                    }
                  },
                  "additionalProperties": false,
                  "required": [
                    "completed",
                    "failed",
                    "total"
                  ],
                  "title": "BatchRequestCounts"
                },
                "usage": {
                  "type": "object",
                  "properties": {
                    "input_tokens": {
                      "type": "integer"
                    },
                    "input_tokens_details": {
                      "type": "object",
                      "properties": {
                        "cached_tokens": {
                          "type": "integer"
                        }
                      },
                      "additionalProperties": false,
                      "required": [
                        "cached_tokens"
                      ],
                      "title": "InputTokensDetails"
                    },
                    "output_tokens": {
                      "type": "integer"
                    },
                    "output_tokens_details": {
                      "type": "object",
                      "properties": {
                        "reasoning_tokens": {
                          "type": "integer"
                        }
                      },
                      "additionalProperties": false,
                      "required": [
                        "reasoning_tokens"
                      ],
                      "title": "OutputTokensDetails"
                    },
                    "total_tokens": {
                      "type": "integer"
                    }
                  },
                  "additionalProperties": false,
                  "required": [
                    "input_tokens",
                    "input_tokens_details",
                    "output_tokens",
                    "output_tokens_details",
                    "total_tokens"
                  ],
                  "title": "BatchUsage"
                }
              },
              "additionalProperties": false,
              "required": [
                "id",
                "completion_window",
                "created_at",
                "endpoint",
                "input_file_id",
                "object",
                "status"
              ],
              "title": "Batch"
            }
          },
          "first_id": {
            "type": "string"
          },
          "last_id": {
            "type": "string"
          },
          "has_more": {
            "type": "boolean",
            "default": false
          }
        },
        "additionalProperties": false,
        "required": [
          "object",
          "data",
          "has_more"
        ],
        "title": "ListBatchesResponse",
        "description": "Response containing a list of batch objects."
      },
      "CreateBatchRequest": {
        "type": "object",
        "properties": {
          "input_file_id": {
            "type": "string",
            "description": "The ID of an uploaded file containing requests for the batch."
          },
          "endpoint": {
            "type": "string",
            "description": "The endpoint to be used for all requests in the batch."
          },
          "completion_window": {
            "type": "string",
            "const": "24h",
            "description": "The time window within which the batch should be processed."
          },
          "metadata": {
            "type": "object",
            "additionalProperties": {
              "type": "string"
            },
            "description": "Optional metadata for the batch."
          },
          "idempotency_key": {
            "type": "string",
            "description": "Optional idempotency key. When provided, enables idempotent behavior."
          }
        },
        "additionalProperties": false,
        "required": [
          "input_file_id",
          "endpoint",
          "completion_window"
        ],
        "title": "CreateBatchRequest"
      },
      "Batch": {
        "type": "object",
        "properties": {
          "id": {
            "type": "string"
          },
          "completion_window": {
            "type": "string"
          },
          "created_at": {
            "type": "integer"
          },
          "endpoint": {
            "type": "string"
          },
          "input_file_id": {
            "type": "string"
          },
          "object": {
            "type": "string",
            "const": "batch"
          },
          "status": {
            "type": "string",
            "enum": [
              "validating",
              "failed",
              "in_progress",
              "finalizing",
              "completed",
              "expired",
              "cancelling",
              "cancelled"
            ]
          },
          "cancelled_at": {
            "type": "integer"
          },
          "cancelling_at": {
            "type": "integer"
          },
          "completed_at": {
            "type": "integer"
          },
          "error_file_id": {
            "type": "string"
          },
          "errors": {
            "type": "object",
            "properties": {
              "data": {
                "type": "array",
                "items": {
                  "type": "object",
                  "properties": {
                    "code": {
                      "type": "string"
                    },
                    "line": {
                      "type": "integer"
                    },
                    "message": {
                      "type": "string"
                    },
                    "param": {
                      "type": "string"
                    }
                  },
                  "additionalProperties": false,
                  "title": "BatchError"
                }
              },
              "object": {
                "type": "string"
              }
            },
            "additionalProperties": false,
            "title": "Errors"
          },
          "expired_at": {
            "type": "integer"
          },
          "expires_at": {
            "type": "integer"
          },
          "failed_at": {
            "type": "integer"
          },
          "finalizing_at": {
            "type": "integer"
          },
          "in_progress_at": {
            "type": "integer"
          },
          "metadata": {
            "type": "object",
            "additionalProperties": {
              "type": "string"
            }
          },
          "model": {
            "type": "string"
          },
          "output_file_id": {
            "type": "string"
          },
          "request_counts": {
            "type": "object",
            "properties": {
              "completed": {
                "type": "integer"
              },
              "failed": {
                "type": "integer"
              },
              "total": {
                "type": "integer"
              }
            },
            "additionalProperties": false,
            "required": [
              "completed",
              "failed",
              "total"
            ],
            "title": "BatchRequestCounts"
          },
          "usage": {
            "type": "object",
            "properties": {
              "input_tokens": {
                "type": "integer"
              },
              "input_tokens_details": {
                "type": "object",
                "properties": {
                  "cached_tokens": {
                    "type": "integer"
                  }
                },
                "additionalProperties": false,
                "required": [
                  "cached_tokens"
                ],
                "title": "InputTokensDetails"
              },
              "output_tokens": {
                "type": "integer"
              },
              "output_tokens_details": {
                "type": "object",
                "properties": {
                  "reasoning_tokens": {
                    "type": "integer"
                  }
                },
                "additionalProperties": false,
                "required": [
                  "reasoning_tokens"
                ],
                "title": "OutputTokensDetails"
              },
              "total_tokens": {
                "type": "integer"
              }
            },
            "additionalProperties": false,
            "required": [
              "input_tokens",
              "input_tokens_details",
              "output_tokens",
              "output_tokens_details",
              "total_tokens"
            ],
            "title": "BatchUsage"
          }
        },
        "additionalProperties": false,
        "required": [
          "id",
          "completion_window",
          "created_at",
          "endpoint",
          "input_file_id",
          "object",
          "status"
        ],
        "title": "Batch"
      },
      "Order": {
        "type": "string",
        "enum": [

@ -13569,6 +14201,10 @@
            "$ref": "#/components/schemas/InterleavedContent",
            "description": "The content of the chunk, which can be interleaved text, images, or other types."
          },
          "chunk_id": {
            "type": "string",
            "description": "Unique identifier for the chunk. Must be provided explicitly."
          },
          "metadata": {
            "type": "object",
            "additionalProperties": {

@ -13602,10 +14238,6 @@
          },
          "description": "Optional embedding for the chunk. If not provided, it will be computed later."
        },
        "stored_chunk_id": {
          "type": "string",
          "description": "The chunk ID that is stored in the vector database. Used for backend functionality."
        },
        "chunk_metadata": {
          "$ref": "#/components/schemas/ChunkMetadata",
          "description": "Metadata for the chunk that will NOT be used in the context during inference. The `chunk_metadata` is required backend functionality."

@ -13614,6 +14246,7 @@
        "additionalProperties": false,
        "required": [
          "content",
          "chunk_id",
          "metadata"
        ],
        "title": "Chunk",

@ -17960,6 +18593,11 @@
      "description": "APIs for creating and interacting with agentic systems.",
      "x-displayName": "Agents"
    },
    {
      "name": "Batches",
      "description": "The API is designed to allow use of openai client libraries for seamless integration.\n\nThis API provides the following extensions:\n - idempotent batch creation\n\nNote: This API is currently under active development and may undergo changes.",
      "x-displayName": "The Batches API enables efficient processing of multiple requests in a single operation, particularly useful for processing large datasets, batch evaluation workflows, and cost-effective inference at scale."
    },
    {
      "name": "Benchmarks",
      "description": ""

@ -18054,6 +18692,7 @@
      "name": "Operations",
      "tags": [
        "Agents",
        "Batches",
        "Benchmarks",
        "Conversations",
        "DatasetIO",
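ListBatchesResponse pairs `data` with `first_id`, `last_id`, and `has_more`, and the list endpoint takes `after` and `limit`, which together describe cursor pagination. A sketch of walking every page — the `getattr` guard is defensive because the client's page type is an assumption here:

```python
def iter_batches(client, page_size: int = 20):
    """Walk GET /v1/batches using the `after` cursor until `has_more` is false."""
    after = None
    while True:
        kwargs = {"limit": page_size}
        if after is not None:
            kwargs["after"] = after
        page = client.batches.list(**kwargs)
        yield from page.data
        if not getattr(page, "has_more", False):
            break
        after = page.data[-1].id  # equivalently, the response's last_id
```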
483
docs/static/stainless-llama-stack-spec.yaml
vendored
483
docs/static/stainless-llama-stack-spec.yaml
vendored
|
|
@@ -15,6 +15,141 @@ info:
servers:
  - url: http://any-hosted-llama-stack.com
paths:
  /v1/batches:
    get:
      responses:
        '200':
          description: A list of batch objects.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ListBatchesResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Batches
      summary: List all batches for the current user.
      description: List all batches for the current user.
      parameters:
        - name: after
          in: query
          description: >-
            A cursor for pagination; returns batches after this batch ID.
          required: false
          schema:
            type: string
        - name: limit
          in: query
          description: >-
            Number of batches to return (default 20, max 100).
          required: true
          schema:
            type: integer
      deprecated: false
    post:
      responses:
        '200':
          description: The created batch object.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Batch'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Batches
      summary: >-
        Create a new batch for processing multiple API requests.
      description: >-
        Create a new batch for processing multiple API requests.
      parameters: []
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateBatchRequest'
        required: true
      deprecated: false
  /v1/batches/{batch_id}:
    get:
      responses:
        '200':
          description: The batch object.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Batch'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Batches
      summary: >-
        Retrieve information about a specific batch.
      description: >-
        Retrieve information about a specific batch.
      parameters:
        - name: batch_id
          in: path
          description: The ID of the batch to retrieve.
          required: true
          schema:
            type: string
      deprecated: false
  /v1/batches/{batch_id}/cancel:
    post:
      responses:
        '200':
          description: The updated batch object.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Batch'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Batches
      summary: Cancel a batch that is in progress.
      description: Cancel a batch that is in progress.
      parameters:
        - name: batch_id
          in: path
          description: The ID of the batch to cancel.
          required: true
          schema:
            type: string
      deprecated: false
  /v1/chat/completions:
    get:
      responses:
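
A small sketch of paging through GET /v1/batches using the after cursor and the ListBatchesResponse fields defined below; host and port are assumptions:

import httpx

base = "http://localhost:8321/v1"
params: dict = {"limit": 20}
while True:
    page = httpx.get(f"{base}/batches", params=params).json()
    for batch in page["data"]:
        print(batch["id"], batch["status"])
    if not page["has_more"]:
        break
    params["after"] = page["last_id"]  # cursor for the next page
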
@@ -4212,6 +4347,331 @@ components:
      title: Error
      description: >-
        Error response from the API. Roughly follows RFC 7807.
    ListBatchesResponse:
      type: object
      properties:
        object:
          type: string
          const: list
          default: list
        data:
          type: array
          items:
            type: object
            properties:
              id:
                type: string
              completion_window:
                type: string
              created_at:
                type: integer
              endpoint:
                type: string
              input_file_id:
                type: string
              object:
                type: string
                const: batch
              status:
                type: string
                enum:
                  - validating
                  - failed
                  - in_progress
                  - finalizing
                  - completed
                  - expired
                  - cancelling
                  - cancelled
              cancelled_at:
                type: integer
              cancelling_at:
                type: integer
              completed_at:
                type: integer
              error_file_id:
                type: string
              errors:
                type: object
                properties:
                  data:
                    type: array
                    items:
                      type: object
                      properties:
                        code:
                          type: string
                        line:
                          type: integer
                        message:
                          type: string
                        param:
                          type: string
                      additionalProperties: false
                      title: BatchError
                  object:
                    type: string
                additionalProperties: false
                title: Errors
              expired_at:
                type: integer
              expires_at:
                type: integer
              failed_at:
                type: integer
              finalizing_at:
                type: integer
              in_progress_at:
                type: integer
              metadata:
                type: object
                additionalProperties:
                  type: string
              model:
                type: string
              output_file_id:
                type: string
              request_counts:
                type: object
                properties:
                  completed:
                    type: integer
                  failed:
                    type: integer
                  total:
                    type: integer
                additionalProperties: false
                required:
                  - completed
                  - failed
                  - total
                title: BatchRequestCounts
              usage:
                type: object
                properties:
                  input_tokens:
                    type: integer
                  input_tokens_details:
                    type: object
                    properties:
                      cached_tokens:
                        type: integer
                    additionalProperties: false
                    required:
                      - cached_tokens
                    title: InputTokensDetails
                  output_tokens:
                    type: integer
                  output_tokens_details:
                    type: object
                    properties:
                      reasoning_tokens:
                        type: integer
                    additionalProperties: false
                    required:
                      - reasoning_tokens
                    title: OutputTokensDetails
                  total_tokens:
                    type: integer
                additionalProperties: false
                required:
                  - input_tokens
                  - input_tokens_details
                  - output_tokens
                  - output_tokens_details
                  - total_tokens
                title: BatchUsage
            additionalProperties: false
            required:
              - id
              - completion_window
              - created_at
              - endpoint
              - input_file_id
              - object
              - status
            title: Batch
        first_id:
          type: string
        last_id:
          type: string
        has_more:
          type: boolean
          default: false
      additionalProperties: false
      required:
        - object
        - data
        - has_more
      title: ListBatchesResponse
      description: >-
        Response containing a list of batch objects.
    CreateBatchRequest:
      type: object
      properties:
        input_file_id:
          type: string
          description: >-
            The ID of an uploaded file containing requests for the batch.
        endpoint:
          type: string
          description: >-
            The endpoint to be used for all requests in the batch.
        completion_window:
          type: string
          const: 24h
          description: >-
            The time window within which the batch should be processed.
        metadata:
          type: object
          additionalProperties:
            type: string
          description: Optional metadata for the batch.
        idempotency_key:
          type: string
          description: >-
            Optional idempotency key. When provided, enables idempotent behavior.
      additionalProperties: false
      required:
        - input_file_id
        - endpoint
        - completion_window
      title: CreateBatchRequest
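
A request body satisfying CreateBatchRequest, shown as a Python dict; the IDs are placeholders:

create_batch_request = {
    "input_file_id": "file-abc123",      # required: uploaded JSONL file of requests
    "endpoint": "/v1/chat/completions",  # required: target endpoint for every request
    "completion_window": "24h",          # required: the schema only allows this const
    "metadata": {"project": "demo"},     # optional string-to-string map
    "idempotency_key": "demo-2025-01",   # optional Llama Stack extension
}
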
    Batch:
      type: object
      properties:
        id:
          type: string
        completion_window:
          type: string
        created_at:
          type: integer
        endpoint:
          type: string
        input_file_id:
          type: string
        object:
          type: string
          const: batch
        status:
          type: string
          enum:
            - validating
            - failed
            - in_progress
            - finalizing
            - completed
            - expired
            - cancelling
            - cancelled
        cancelled_at:
          type: integer
        cancelling_at:
          type: integer
        completed_at:
          type: integer
        error_file_id:
          type: string
        errors:
          type: object
          properties:
            data:
              type: array
              items:
                type: object
                properties:
                  code:
                    type: string
                  line:
                    type: integer
                  message:
                    type: string
                  param:
                    type: string
                additionalProperties: false
                title: BatchError
            object:
              type: string
          additionalProperties: false
          title: Errors
        expired_at:
          type: integer
        expires_at:
          type: integer
        failed_at:
          type: integer
        finalizing_at:
          type: integer
        in_progress_at:
          type: integer
        metadata:
          type: object
          additionalProperties:
            type: string
        model:
          type: string
        output_file_id:
          type: string
        request_counts:
          type: object
          properties:
            completed:
              type: integer
            failed:
              type: integer
            total:
              type: integer
          additionalProperties: false
          required:
            - completed
            - failed
            - total
          title: BatchRequestCounts
        usage:
          type: object
          properties:
            input_tokens:
              type: integer
            input_tokens_details:
              type: object
              properties:
                cached_tokens:
                  type: integer
              additionalProperties: false
              required:
                - cached_tokens
              title: InputTokensDetails
            output_tokens:
              type: integer
            output_tokens_details:
              type: object
              properties:
                reasoning_tokens:
                  type: integer
              additionalProperties: false
              required:
                - reasoning_tokens
              title: OutputTokensDetails
            total_tokens:
              type: integer
          additionalProperties: false
          required:
            - input_tokens
            - input_tokens_details
            - output_tokens
            - output_tokens_details
            - total_tokens
          title: BatchUsage
      additionalProperties: false
      required:
        - id
        - completion_window
        - created_at
        - endpoint
        - input_file_id
        - object
        - status
      title: Batch
    Order:
      type: string
      enum:
@@ -10258,6 +10718,10 @@ components:
          description: >-
            The content of the chunk, which can be interleaved text, images, or other
            types.
        chunk_id:
          type: string
          description: >-
            Unique identifier for the chunk. Must be provided explicitly.
        metadata:
          type: object
          additionalProperties:
@@ -10278,10 +10742,6 @@ components:
          description: >-
            Optional embedding for the chunk. If not provided, it will be computed
            later.
        stored_chunk_id:
          type: string
          description: >-
            The chunk ID that is stored in the vector database. Used for backend functionality.
        chunk_metadata:
          $ref: '#/components/schemas/ChunkMetadata'
          description: >-
@@ -10290,6 +10750,7 @@ components:
      additionalProperties: false
      required:
        - content
        - chunk_id
        - metadata
      title: Chunk
      description: >-
@@ -13527,6 +13988,19 @@ tags:
    description: >-
      APIs for creating and interacting with agentic systems.
    x-displayName: Agents
  - name: Batches
    description: >-
      The API is designed to allow use of openai client libraries for seamless integration.


      This API provides the following extensions:
       - idempotent batch creation

      Note: This API is currently under active development and may undergo changes.
    x-displayName: >-
      The Batches API enables efficient processing of multiple requests in a single
      operation, particularly useful for processing large datasets, batch evaluation
      workflows, and cost-effective inference at scale.
  - name: Benchmarks
    description: ''
  - name: Conversations
@@ -13601,6 +14075,7 @@ x-tagGroups:
  - name: Operations
    tags:
      - Agents
      - Batches
      - Benchmarks
      - Conversations
      - DatasetIO

@@ -285,7 +285,6 @@ exclude = [
    "^src/llama_stack/models/llama/llama3/interface\\.py$",
    "^src/llama_stack/models/llama/llama3/tokenizer\\.py$",
    "^src/llama_stack/models/llama/llama3/tool_utils\\.py$",
    "^src/llama_stack/providers/inline/agents/meta_reference/",
    "^src/llama_stack/providers/inline/datasetio/localfs/",
    "^src/llama_stack/providers/inline/eval/meta_reference/eval\\.py$",
    "^src/llama_stack/providers/inline/inference/meta_reference/inference\\.py$",

@@ -313,8 +313,20 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
  fi
  echo "Using image: $IMAGE_NAME"

  docker run -d --network host --name "$container_name" \
    -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  # On macOS/Darwin, --network host doesn't work as expected due to Docker running in a VM
  # Use regular port mapping instead
  NETWORK_MODE=""
  PORT_MAPPINGS=""
  if [[ "$(uname)" != "Darwin" ]] && [[ "$(uname)" != *"MINGW"* ]]; then
    NETWORK_MODE="--network host"
  else
    # On non-Linux (macOS, Windows), need explicit port mappings for both app and telemetry
    PORT_MAPPINGS="-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT -p $COLLECTOR_PORT:$COLLECTOR_PORT"
    echo "Using bridge networking with port mapping (non-Linux)"
  fi

  docker run -d $NETWORK_MODE --name "$container_name" \
    $PORT_MAPPINGS \
    $DOCKER_ENV_VARS \
    "$IMAGE_NAME" \
    --port $LLAMA_STACK_PORT

@@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from collections.abc import Sequence
from typing import Annotated, Any, Literal

from pydantic import BaseModel, Field, model_validator
@@ -202,7 +203,7 @@ class OpenAIResponseMessage(BaseModel):
    scenarios.
    """

    content: str | list[OpenAIResponseInputMessageContent] | list[OpenAIResponseOutputMessageContent]
    content: str | Sequence[OpenAIResponseInputMessageContent] | Sequence[OpenAIResponseOutputMessageContent]
    role: Literal["system"] | Literal["developer"] | Literal["user"] | Literal["assistant"]
    type: Literal["message"] = "message"

@@ -254,10 +255,10 @@ class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
    """

    id: str
    queries: list[str]
    queries: Sequence[str]
    status: str
    type: Literal["file_search_call"] = "file_search_call"
    results: list[OpenAIResponseOutputMessageFileSearchToolCallResults] | None = None
    results: Sequence[OpenAIResponseOutputMessageFileSearchToolCallResults] | None = None


@json_schema_type
@@ -597,7 +598,7 @@ class OpenAIResponseObject(BaseModel):
    id: str
    model: str
    object: Literal["response"] = "response"
    output: list[OpenAIResponseOutput]
    output: Sequence[OpenAIResponseOutput]
    parallel_tool_calls: bool = False
    previous_response_id: str | None = None
    prompt: OpenAIResponsePrompt | None = None
@@ -607,7 +608,7 @@ class OpenAIResponseObject(BaseModel):
    # before the field was added. New responses will have this set always.
    text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text"))
    top_p: float | None = None
    tools: list[OpenAIResponseTool] | None = None
    tools: Sequence[OpenAIResponseTool] | None = None
    truncation: str | None = None
    usage: OpenAIResponseUsage | None = None
    instructions: str | None = None
@@ -1315,7 +1316,7 @@ class ListOpenAIResponseInputItem(BaseModel):
    :param object: Object type identifier, always "list"
    """

    data: list[OpenAIResponseInput]
    data: Sequence[OpenAIResponseInput]
    object: Literal["list"] = "list"


@@ -1326,7 +1327,7 @@ class OpenAIResponseObjectWithInput(OpenAIResponseObject):
    :param input: List of input items that led to this response
    """

    input: list[OpenAIResponseInput]
    input: Sequence[OpenAIResponseInput]

    def to_response_object(self) -> OpenAIResponseObject:
        """Convert to OpenAIResponseObject by excluding input field."""
@@ -1344,7 +1345,7 @@ class ListOpenAIResponseObject(BaseModel):
    :param object: Object type identifier, always "list"
    """

    data: list[OpenAIResponseObjectWithInput]
    data: Sequence[OpenAIResponseObjectWithInput]
    has_more: bool
    first_id: str
    last_id: str
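
The list-to-Sequence changes above are about mypy variance rather than runtime behavior: list is invariant, so a list[Sub] is not accepted where list[Base] is declared, while the read-only Sequence is covariant. A minimal illustration with throwaway names:

from collections.abc import Sequence

class Base: ...
class Sub(Base): ...

def takes_list(xs: list[Base]) -> None: ...
def takes_seq(xs: Sequence[Base]) -> None: ...

subs: list[Sub] = [Sub()]
takes_seq(subs)   # accepted: Sequence is covariant in its element type
takes_list(subs)  # mypy error: list[Sub] is not list[Base]
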

@@ -8,7 +8,6 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import uuid
from typing import Annotated, Any, Literal, Protocol, runtime_checkable

from fastapi import Body
@@ -18,7 +17,6 @@ from llama_stack.apis.inference import InterleavedContent
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
from llama_stack.schema_utils import json_schema_type, webmethod
from llama_stack.strong_typing.schema import register_schema

@@ -61,38 +59,19 @@ class Chunk(BaseModel):
    """
    A chunk of content that can be inserted into a vector database.
    :param content: The content of the chunk, which can be interleaved text, images, or other types.
    :param embedding: Optional embedding for the chunk. If not provided, it will be computed later.
    :param chunk_id: Unique identifier for the chunk. Must be provided explicitly.
    :param metadata: Metadata associated with the chunk that will be used in the model context during inference.
    :param stored_chunk_id: The chunk ID that is stored in the vector database. Used for backend functionality.
    :param embedding: Optional embedding for the chunk. If not provided, it will be computed later.
    :param chunk_metadata: Metadata for the chunk that will NOT be used in the context during inference.
        The `chunk_metadata` is required backend functionality.
    """

    content: InterleavedContent
    chunk_id: str
    metadata: dict[str, Any] = Field(default_factory=dict)
    embedding: list[float] | None = None
    # The alias parameter serializes the field as "chunk_id" in JSON but keeps the internal name as "stored_chunk_id"
    stored_chunk_id: str | None = Field(default=None, alias="chunk_id")
    chunk_metadata: ChunkMetadata | None = None

    model_config = {"populate_by_name": True}

    def model_post_init(self, __context):
        # Extract chunk_id from metadata if present
        if self.metadata and "chunk_id" in self.metadata:
            self.stored_chunk_id = self.metadata.pop("chunk_id")

    @property
    def chunk_id(self) -> str:
        """Returns the chunk ID, which is either an input `chunk_id` or a generated one if not set."""
        if self.stored_chunk_id:
            return self.stored_chunk_id

        if "document_id" in self.metadata:
            return generate_chunk_id(self.metadata["document_id"], str(self.content))

        return generate_chunk_id(str(uuid.uuid4()), str(self.content))

    @property
    def document_id(self) -> str | None:
        """Returns the document_id from either metadata or chunk_metadata, with metadata taking precedence."""
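
Given the reworked contract above, constructing a chunk now looks roughly like this; the import path and values are assumptions, not part of the diff:

from llama_stack.apis.vector_io import Chunk  # path assumed

chunk = Chunk(
    content="The quick brown fox jumps over the lazy dog.",
    chunk_id="doc-42-chunk-0",           # explicit, per the new contract
    metadata={"document_id": "doc-42"},  # used in the model context
)
# embedding is left as None; providers may compute it later, as the docstring notes
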

@@ -13,6 +13,8 @@ from llama_stack.core.datatypes import (
    ModelWithOwner,
    RegistryEntrySource,
)
from llama_stack.core.request_headers import PROVIDER_DATA_VAR, NeedsRequestProviderData
from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.log import get_logger

from .common import CommonRoutingTableImpl, lookup_model

@@ -42,11 +44,90 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):

        await self.update_registered_models(provider_id, models)

    async def _get_dynamic_models_from_provider_data(self) -> list[Model]:
        """
        Fetch models from providers that have credentials in the current request's provider_data.

        This allows users to see models available to them from providers that require
        per-request API keys (via X-LlamaStack-Provider-Data header).

        Returns models with fully qualified identifiers (provider_id/model_id) but does NOT
        cache them in the registry since they are user-specific.
        """
        provider_data = PROVIDER_DATA_VAR.get()
        if not provider_data:
            return []

        dynamic_models = []

        for provider_id, provider in self.impls_by_provider_id.items():
            # Check if this provider supports provider_data
            if not isinstance(provider, NeedsRequestProviderData):
                continue

            # Check if provider has a validator (some providers like ollama don't need per-request credentials)
            spec = getattr(provider, "__provider_spec__", None)
            if not spec or not getattr(spec, "provider_data_validator", None):
                continue

            # Validate provider_data silently - we're speculatively checking all providers
            # so validation failures are expected when user didn't provide keys for this provider
            try:
                validator = instantiate_class_type(spec.provider_data_validator)
                validator(**provider_data)
            except Exception:
                # User didn't provide credentials for this provider - skip silently
                continue

            # Validation succeeded! User has credentials for this provider
            # Now try to list models
            try:
                models = await provider.list_models()
                if not models:
                    continue

                # Ensure models have fully qualified identifiers with provider_id prefix
                for model in models:
                    # Only add prefix if model identifier doesn't already have it
                    if not model.identifier.startswith(f"{provider_id}/"):
                        model.identifier = f"{provider_id}/{model.provider_resource_id}"

                    dynamic_models.append(model)

                logger.debug(f"Fetched {len(models)} models from provider {provider_id} using provider_data")

            except Exception as e:
                logger.debug(f"Failed to list models from provider {provider_id} with provider_data: {e}")
                continue

        return dynamic_models

    async def list_models(self) -> ListModelsResponse:
        return ListModelsResponse(data=await self.get_all_with_type("model"))
        # Get models from registry
        registry_models = await self.get_all_with_type("model")

        # Get additional models available via provider_data (user-specific, not cached)
        dynamic_models = await self._get_dynamic_models_from_provider_data()

        # Combine, avoiding duplicates (registry takes precedence)
        registry_identifiers = {m.identifier for m in registry_models}
        unique_dynamic_models = [m for m in dynamic_models if m.identifier not in registry_identifiers]

        return ListModelsResponse(data=registry_models + unique_dynamic_models)

    async def openai_list_models(self) -> OpenAIListModelsResponse:
        models = await self.get_all_with_type("model")
        # Get models from registry
        registry_models = await self.get_all_with_type("model")

        # Get additional models available via provider_data (user-specific, not cached)
        dynamic_models = await self._get_dynamic_models_from_provider_data()

        # Combine, avoiding duplicates (registry takes precedence)
        registry_identifiers = {m.identifier for m in registry_models}
        unique_dynamic_models = [m for m in dynamic_models if m.identifier not in registry_identifiers]

        all_models = registry_models + unique_dynamic_models

        openai_models = [
            OpenAIModel(
                id=model.identifier,
@@ -54,7 +135,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
                created=int(time.time()),
                owned_by="llama_stack",
            )
            for model in models
            for model in all_models
        ]
        return OpenAIListModelsResponse(data=openai_models)
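
A sketch of the per-request credential flow the docstring describes: the header name comes from the docstring, while the payload key, host, and port are illustrative assumptions for an OpenAI-backed provider:

import json
import httpx

headers = {"X-LlamaStack-Provider-Data": json.dumps({"openai_api_key": "sk-..."})}
resp = httpx.get("http://localhost:8321/v1/models", headers=headers)
for model in resp.json()["data"]:
    print(model["id"])  # registry models plus user-specific provider_id/model_id entries
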
@@ -14,6 +14,7 @@ from typing import Any
import yaml

from llama_stack.apis.agents import Agents
from llama_stack.apis.batches import Batches
from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.conversations import Conversations
from llama_stack.apis.datasetio import DatasetIO
@@ -63,6 +64,7 @@ class LlamaStack(
    Providers,
    Inference,
    Agents,
    Batches,
    Safety,
    SyntheticDataGeneration,
    Datasets,

@@ -11,6 +11,7 @@ import uuid
import warnings
from collections.abc import AsyncGenerator
from datetime import UTC, datetime
from typing import Any, cast

import httpx

@@ -125,12 +126,12 @@ class ChatAgent(ShieldRunnerMixin):
        )

    def turn_to_messages(self, turn: Turn) -> list[Message]:
        messages = []
        messages: list[Message] = []

        # NOTE: if a toolcall response is in a step, we do not add it when processing the input messages
        tool_call_ids = set()
        for step in turn.steps:
            if step.step_type == StepType.tool_execution.value:
            if step.step_type == StepType.tool_execution.value and isinstance(step, ToolExecutionStep):
                for response in step.tool_responses:
                    tool_call_ids.add(response.call_id)

@@ -149,9 +150,9 @@ class ChatAgent(ShieldRunnerMixin):
            messages.append(msg)

        for step in turn.steps:
            if step.step_type == StepType.inference.value:
            if step.step_type == StepType.inference.value and isinstance(step, InferenceStep):
                messages.append(step.model_response)
            elif step.step_type == StepType.tool_execution.value:
            elif step.step_type == StepType.tool_execution.value and isinstance(step, ToolExecutionStep):
                for response in step.tool_responses:
                    messages.append(
                        ToolResponseMessage(
@@ -159,8 +160,8 @@ class ChatAgent(ShieldRunnerMixin):
                            content=response.content,
                        )
                    )
            elif step.step_type == StepType.shield_call.value:
                if step.violation:
            elif step.step_type == StepType.shield_call.value and isinstance(step, ShieldCallStep):
                if step.violation and step.violation.user_message:
                    # CompletionMessage itself in the ShieldResponse
                    messages.append(
                        CompletionMessage(
@@ -174,7 +175,7 @@ class ChatAgent(ShieldRunnerMixin):
        return await self.storage.create_session(name)

    async def get_messages_from_turns(self, turns: list[Turn]) -> list[Message]:
        messages = []
        messages: list[Message] = []
        if self.agent_config.instructions != "":
            messages.append(SystemMessage(content=self.agent_config.instructions))

@@ -231,7 +232,9 @@ class ChatAgent(ShieldRunnerMixin):

        steps = []
        messages = await self.get_messages_from_turns(turns)

        if is_resume:
            assert isinstance(request, AgentTurnResumeRequest)
            tool_response_messages = [
                ToolResponseMessage(call_id=x.call_id, content=x.content) for x in request.tool_responses
            ]
@@ -252,42 +255,52 @@ class ChatAgent(ShieldRunnerMixin):
            in_progress_tool_call_step = await self.storage.get_in_progress_tool_call_step(
                request.session_id, request.turn_id
            )
            now = datetime.now(UTC).isoformat()
            now_dt = datetime.now(UTC)
            tool_execution_step = ToolExecutionStep(
                step_id=(in_progress_tool_call_step.step_id if in_progress_tool_call_step else str(uuid.uuid4())),
                turn_id=request.turn_id,
                tool_calls=(in_progress_tool_call_step.tool_calls if in_progress_tool_call_step else []),
                tool_responses=request.tool_responses,
                completed_at=now,
                started_at=(in_progress_tool_call_step.started_at if in_progress_tool_call_step else now),
                completed_at=now_dt,
                started_at=(in_progress_tool_call_step.started_at if in_progress_tool_call_step else now_dt),
            )
            steps.append(tool_execution_step)
            yield AgentTurnResponseStreamChunk(
                event=AgentTurnResponseEvent(
                    payload=AgentTurnResponseStepCompletePayload(
                        step_type=StepType.tool_execution.value,
                        step_type=StepType.tool_execution,
                        step_id=tool_execution_step.step_id,
                        step_details=tool_execution_step,
                    )
                )
            )
            input_messages = last_turn.input_messages
            # Cast needed due to list invariance - last_turn.input_messages is the right type
            input_messages = last_turn.input_messages  # type: ignore[assignment]

            turn_id = request.turn_id
            actual_turn_id = request.turn_id
            start_time = last_turn.started_at
        else:
            assert isinstance(request, AgentTurnCreateRequest)
            messages.extend(request.messages)
            start_time = datetime.now(UTC).isoformat()
            input_messages = request.messages
            start_time = datetime.now(UTC)
            # Cast needed due to list invariance - request.messages is the right type
            input_messages = request.messages  # type: ignore[assignment]
            # Use the generated turn_id from beginning of function
            actual_turn_id = turn_id if turn_id else str(uuid.uuid4())

        output_message = None
        req_documents = request.documents if isinstance(request, AgentTurnCreateRequest) and not is_resume else None
        req_sampling = (
            self.agent_config.sampling_params if self.agent_config.sampling_params is not None else SamplingParams()
        )

        async for chunk in self.run(
            session_id=request.session_id,
            turn_id=turn_id,
            turn_id=actual_turn_id,
            input_messages=messages,
            sampling_params=self.agent_config.sampling_params,
            sampling_params=req_sampling,
            stream=request.stream,
            documents=request.documents if not is_resume else None,
            documents=req_documents,
        ):
            if isinstance(chunk, CompletionMessage):
                output_message = chunk
@@ -295,20 +308,23 @@ class ChatAgent(ShieldRunnerMixin):

            assert isinstance(chunk, AgentTurnResponseStreamChunk), f"Unexpected type {type(chunk)}"
            event = chunk.event
            if event.payload.event_type == AgentTurnResponseEventType.step_complete.value:
                steps.append(event.payload.step_details)
            if event.payload.event_type == AgentTurnResponseEventType.step_complete.value and hasattr(
                event.payload, "step_details"
            ):
                step_details = event.payload.step_details
                steps.append(step_details)

            yield chunk

        assert output_message is not None

        turn = Turn(
            turn_id=turn_id,
            turn_id=actual_turn_id,
            session_id=request.session_id,
            input_messages=input_messages,
            input_messages=input_messages,  # type: ignore[arg-type]
            output_message=output_message,
            started_at=start_time,
            completed_at=datetime.now(UTC).isoformat(),
            completed_at=datetime.now(UTC),
            steps=steps,
        )
        await self.storage.add_turn_to_session(request.session_id, turn)
@@ -345,9 +361,9 @@ class ChatAgent(ShieldRunnerMixin):
        # return a "final value" for the `yield from` statement. we simulate that by yielding a
        # final boolean (to see whether an exception happened) and then explicitly testing for it.

        if len(self.input_shields) > 0:
        if self.input_shields:
            async for res in self.run_multiple_shields_wrapper(
                turn_id, input_messages, self.input_shields, "user-input"
                turn_id, cast(list[OpenAIMessageParam], input_messages), self.input_shields, "user-input"
            ):
                if isinstance(res, bool):
                    return
@@ -374,9 +390,9 @@ class ChatAgent(ShieldRunnerMixin):
        # for output shields run on the full input and output combination
        messages = input_messages + [final_response]

        if len(self.output_shields) > 0:
        if self.output_shields:
            async for res in self.run_multiple_shields_wrapper(
                turn_id, messages, self.output_shields, "assistant-output"
                turn_id, cast(list[OpenAIMessageParam], messages), self.output_shields, "assistant-output"
            ):
                if isinstance(res, bool):
                    return
@@ -388,7 +404,7 @@ class ChatAgent(ShieldRunnerMixin):
    async def run_multiple_shields_wrapper(
        self,
        turn_id: str,
        messages: list[Message],
        messages: list[OpenAIMessageParam],
        shields: list[str],
        touchpoint: str,
    ) -> AsyncGenerator:
@@ -402,12 +418,12 @@ class ChatAgent(ShieldRunnerMixin):
            return

        step_id = str(uuid.uuid4())
        shield_call_start_time = datetime.now(UTC).isoformat()
        shield_call_start_time = datetime.now(UTC)
        try:
            yield AgentTurnResponseStreamChunk(
                event=AgentTurnResponseEvent(
                    payload=AgentTurnResponseStepStartPayload(
                        step_type=StepType.shield_call.value,
                        step_type=StepType.shield_call,
                        step_id=step_id,
                        metadata=dict(touchpoint=touchpoint),
                    )
@@ -419,14 +435,14 @@ class ChatAgent(ShieldRunnerMixin):
            yield AgentTurnResponseStreamChunk(
                event=AgentTurnResponseEvent(
                    payload=AgentTurnResponseStepCompletePayload(
                        step_type=StepType.shield_call.value,
                        step_type=StepType.shield_call,
                        step_id=step_id,
                        step_details=ShieldCallStep(
                            step_id=step_id,
                            turn_id=turn_id,
                            violation=e.violation,
                            started_at=shield_call_start_time,
                            completed_at=datetime.now(UTC).isoformat(),
                            completed_at=datetime.now(UTC),
                        ),
                    )
                )
@@ -443,14 +459,14 @@ class ChatAgent(ShieldRunnerMixin):
            yield AgentTurnResponseStreamChunk(
                event=AgentTurnResponseEvent(
                    payload=AgentTurnResponseStepCompletePayload(
                        step_type=StepType.shield_call.value,
                        step_type=StepType.shield_call,
                        step_id=step_id,
                        step_details=ShieldCallStep(
                            step_id=step_id,
                            turn_id=turn_id,
                            violation=None,
                            started_at=shield_call_start_time,
                            completed_at=datetime.now(UTC).isoformat(),
                            completed_at=datetime.now(UTC),
                        ),
                    )
                )
@@ -496,21 +512,22 @@ class ChatAgent(ShieldRunnerMixin):
            else:
                self.tool_name_to_args[tool_name]["vector_store_ids"].append(session_info.vector_store_id)

        output_attachments = []
        output_attachments: list[Attachment] = []

        n_iter = await self.storage.get_num_infer_iters_in_turn(session_id, turn_id) or 0

        # Build a map of custom tools to their definitions for faster lookup
        client_tools = {}
        for tool in self.agent_config.client_tools:
            client_tools[tool.name] = tool
        if self.agent_config.client_tools:
            for tool in self.agent_config.client_tools:
                client_tools[tool.name] = tool
        while True:
            step_id = str(uuid.uuid4())
            inference_start_time = datetime.now(UTC).isoformat()
            inference_start_time = datetime.now(UTC)
            yield AgentTurnResponseStreamChunk(
                event=AgentTurnResponseEvent(
                    payload=AgentTurnResponseStepStartPayload(
                        step_type=StepType.inference.value,
                        step_type=StepType.inference,
                        step_id=step_id,
                    )
                )
@@ -538,7 +555,7 @@ class ChatAgent(ShieldRunnerMixin):
            else:
                return value

        def _add_type(openai_msg: dict) -> OpenAIMessageParam:
        def _add_type(openai_msg: Any) -> OpenAIMessageParam:
            # Serialize any nested Pydantic models to plain dicts
            openai_msg = _serialize_nested(openai_msg)

@@ -588,7 +605,7 @@ class ChatAgent(ShieldRunnerMixin):
                messages=openai_messages,
                tools=openai_tools if openai_tools else None,
                tool_choice=tool_choice,
                response_format=self.agent_config.response_format,
                response_format=self.agent_config.response_format,  # type: ignore[arg-type]
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens,
@@ -598,7 +615,8 @@ class ChatAgent(ShieldRunnerMixin):

            # Convert OpenAI stream back to Llama Stack format
            response_stream = convert_openai_chat_completion_stream(
                openai_stream, enable_incremental_tool_calls=True
                openai_stream,  # type: ignore[arg-type]
                enable_incremental_tool_calls=True,
            )

            async for chunk in response_stream:
@@ -620,7 +638,7 @@ class ChatAgent(ShieldRunnerMixin):
                    yield AgentTurnResponseStreamChunk(
                        event=AgentTurnResponseEvent(
                            payload=AgentTurnResponseStepProgressPayload(
                                step_type=StepType.inference.value,
                                step_type=StepType.inference,
                                step_id=step_id,
                                delta=delta,
                            )
@@ -633,7 +651,7 @@ class ChatAgent(ShieldRunnerMixin):
                    yield AgentTurnResponseStreamChunk(
                        event=AgentTurnResponseEvent(
                            payload=AgentTurnResponseStepProgressPayload(
                                step_type=StepType.inference.value,
                                step_type=StepType.inference,
                                step_id=step_id,
                                delta=delta,
                            )
@@ -651,7 +669,9 @@ class ChatAgent(ShieldRunnerMixin):
                    output_attr = json.dumps(
                        {
                            "content": content,
                            "tool_calls": [json.loads(t.model_dump_json()) for t in tool_calls],
                            "tool_calls": [
                                json.loads(t.model_dump_json()) for t in tool_calls if isinstance(t, ToolCall)
                            ],
                        }
                    )
                    span.set_attribute("output", output_attr)
@@ -667,16 +687,18 @@ class ChatAgent(ShieldRunnerMixin):
            if tool_calls:
                content = ""

            # Filter out string tool calls for CompletionMessage (only keep ToolCall objects)
            valid_tool_calls = [t for t in tool_calls if isinstance(t, ToolCall)]
            message = CompletionMessage(
                content=content,
                stop_reason=stop_reason,
                tool_calls=tool_calls,
                tool_calls=valid_tool_calls if valid_tool_calls else None,
            )

            yield AgentTurnResponseStreamChunk(
                event=AgentTurnResponseEvent(
                    payload=AgentTurnResponseStepCompletePayload(
                        step_type=StepType.inference.value,
                        step_type=StepType.inference,
                        step_id=step_id,
                        step_details=InferenceStep(
                            # somewhere deep, we are re-assigning message or closing over some
@@ -686,13 +708,14 @@ class ChatAgent(ShieldRunnerMixin):
                            turn_id=turn_id,
                            model_response=copy.deepcopy(message),
                            started_at=inference_start_time,
                            completed_at=datetime.now(UTC).isoformat(),
                            completed_at=datetime.now(UTC),
                        ),
                    )
                )
            )

            if n_iter >= self.agent_config.max_infer_iters:
            max_iters = self.agent_config.max_infer_iters if self.agent_config.max_infer_iters is not None else 10
            if n_iter >= max_iters:
                logger.info(f"done with MAX iterations ({n_iter}), exiting.")
                # NOTE: mark end_of_turn to indicate to client that we are done with the turn
                # Do not continue the tool call loop after this point
@@ -705,14 +728,16 @@ class ChatAgent(ShieldRunnerMixin):
                yield message
                break

            if len(message.tool_calls) == 0:
            if not message.tool_calls or len(message.tool_calls) == 0:
                if stop_reason == StopReason.end_of_turn:
                    # TODO: UPDATE RETURN TYPE TO SEND A TUPLE OF (MESSAGE, ATTACHMENTS)
                    if len(output_attachments) > 0:
                        if isinstance(message.content, list):
                            message.content += output_attachments
                            # List invariance - attachments are compatible at runtime
                            message.content += output_attachments  # type: ignore[arg-type]
                        else:
                            message.content = [message.content] + output_attachments
                            # List invariance - attachments are compatible at runtime
                            message.content = [message.content] + output_attachments  # type: ignore[assignment]
                    yield message
                else:
                    logger.debug(f"completion message with EOM (iter: {n_iter}): {str(message)}")
@@ -725,11 +750,12 @@ class ChatAgent(ShieldRunnerMixin):
            non_client_tool_calls = []

            # Separate client and non-client tool calls
            for tool_call in message.tool_calls:
                if tool_call.tool_name in client_tools:
                    client_tool_calls.append(tool_call)
                else:
                    non_client_tool_calls.append(tool_call)
            if message.tool_calls:
                for tool_call in message.tool_calls:
                    if tool_call.tool_name in client_tools:
                        client_tool_calls.append(tool_call)
                    else:
                        non_client_tool_calls.append(tool_call)

            # Process non-client tool calls first
            for tool_call in non_client_tool_calls:
@@ -737,7 +763,7 @@ class ChatAgent(ShieldRunnerMixin):
                yield AgentTurnResponseStreamChunk(
                    event=AgentTurnResponseEvent(
                        payload=AgentTurnResponseStepStartPayload(
                            step_type=StepType.tool_execution.value,
                            step_type=StepType.tool_execution,
                            step_id=step_id,
                        )
                    )
@@ -746,7 +772,7 @@ class ChatAgent(ShieldRunnerMixin):
                yield AgentTurnResponseStreamChunk(
                    event=AgentTurnResponseEvent(
                        payload=AgentTurnResponseStepProgressPayload(
                            step_type=StepType.tool_execution.value,
                            step_type=StepType.tool_execution,
                            step_id=step_id,
                            delta=ToolCallDelta(
                                parse_status=ToolCallParseStatus.in_progress,
@@ -766,7 +792,7 @@ class ChatAgent(ShieldRunnerMixin):
                    if self.telemetry_enabled
                    else {},
                ) as span:
                    tool_execution_start_time = datetime.now(UTC).isoformat()
                    tool_execution_start_time = datetime.now(UTC)
                    tool_result = await self.execute_tool_call_maybe(
                        session_id,
                        tool_call,
@@ -796,14 +822,14 @@ class ChatAgent(ShieldRunnerMixin):
                        )
                    ],
                    started_at=tool_execution_start_time,
                    completed_at=datetime.now(UTC).isoformat(),
                    completed_at=datetime.now(UTC),
                )

                # Yield the step completion event
                yield AgentTurnResponseStreamChunk(
                    event=AgentTurnResponseEvent(
                        payload=AgentTurnResponseStepCompletePayload(
                            step_type=StepType.tool_execution.value,
                            step_type=StepType.tool_execution,
                            step_id=step_id,
                            step_details=tool_execution_step,
                        )
@@ -833,7 +859,7 @@ class ChatAgent(ShieldRunnerMixin):
                    turn_id=turn_id,
                    tool_calls=client_tool_calls,
                    tool_responses=[],
                    started_at=datetime.now(UTC).isoformat(),
                    started_at=datetime.now(UTC),
                ),
            )

@@ -868,19 +894,20 @@ class ChatAgent(ShieldRunnerMixin):

        toolgroup_to_args = toolgroup_to_args or {}

        tool_name_to_def = {}
        tool_name_to_def: dict[str, ToolDefinition] = {}
        tool_name_to_args = {}

        for tool_def in self.agent_config.client_tools:
            if tool_name_to_def.get(tool_def.name, None):
                raise ValueError(f"Tool {tool_def.name} already exists")
        if self.agent_config.client_tools:
            for tool_def in self.agent_config.client_tools:
                if tool_name_to_def.get(tool_def.name, None):
                    raise ValueError(f"Tool {tool_def.name} already exists")

            # Use input_schema from ToolDef directly
            tool_name_to_def[tool_def.name] = ToolDefinition(
                tool_name=tool_def.name,
                description=tool_def.description,
                input_schema=tool_def.input_schema,
            )
                # Use input_schema from ToolDef directly
                tool_name_to_def[tool_def.name] = ToolDefinition(
                    tool_name=tool_def.name,
                    description=tool_def.description,
                    input_schema=tool_def.input_schema,
                )
        for toolgroup_name_with_maybe_tool_name in agent_config_toolgroups:
            toolgroup_name, input_tool_name = self._parse_toolgroup_name(toolgroup_name_with_maybe_tool_name)
            tools = await self.tool_groups_api.list_tools(toolgroup_id=toolgroup_name)
@@ -908,15 +935,17 @@ class ChatAgent(ShieldRunnerMixin):
            else:
                identifier = None

            if tool_name_to_def.get(identifier, None):
                raise ValueError(f"Tool {identifier} already exists")
            if identifier:
                tool_name_to_def[identifier] = ToolDefinition(
                    tool_name=identifier,
                # Convert BuiltinTool to string for dictionary key
                identifier_str = identifier.value if isinstance(identifier, BuiltinTool) else identifier
                if tool_name_to_def.get(identifier_str, None):
                    raise ValueError(f"Tool {identifier_str} already exists")
                tool_name_to_def[identifier_str] = ToolDefinition(
                    tool_name=identifier_str,
                    description=tool_def.description,
                    input_schema=tool_def.input_schema,
                )
            tool_name_to_args[identifier] = toolgroup_to_args.get(toolgroup_name, {})
                tool_name_to_args[identifier_str] = toolgroup_to_args.get(toolgroup_name, {})

        self.tool_defs, self.tool_name_to_args = (
            list(tool_name_to_def.values()),
@@ -966,14 +995,17 @@ class ChatAgent(ShieldRunnerMixin):
        except json.JSONDecodeError as e:
            raise ValueError(f"Failed to parse arguments for tool call: {tool_call.arguments}") from e

        result = await self.tool_runtime_api.invoke_tool(
            tool_name=tool_name_str,
            kwargs={
                "session_id": session_id,
                # get the arguments generated by the model and augment with toolgroup arg overrides for the agent
                **args,
                **self.tool_name_to_args.get(tool_name_str, {}),
            },
        result = cast(
            ToolInvocationResult,
            await self.tool_runtime_api.invoke_tool(
                tool_name=tool_name_str,
                kwargs={
                    "session_id": session_id,
                    # get the arguments generated by the model and augment with toolgroup arg overrides for the agent
                    **args,
                    **self.tool_name_to_args.get(tool_name_str, {}),
                },
            ),
        )
        logger.debug(f"tool call {tool_name_str} completed with result: {result}")
        return result
@@ -1017,7 +1049,7 @@ def _interpret_content_as_attachment(
        snippet = match.group(1)
        data = json.loads(snippet)
        return Attachment(
            url=URL(uri="file://" + data["filepath"]),
            content=URL(uri="file://" + data["filepath"]),
            mime_type=data["mimetype"],
        )

@@ -21,6 +21,7 @@ from llama_stack.apis.agents import (
    Document,
    ListOpenAIResponseInputItem,
    ListOpenAIResponseObject,
    OpenAIDeleteResponseObject,
    OpenAIResponseInput,
    OpenAIResponseInputTool,
    OpenAIResponseObject,
@@ -141,7 +142,7 @@ class MetaReferenceAgentsImpl(Agents):
            persistence_store=(
                self.persistence_store if agent_info.enable_session_persistence else self.in_memory_store
            ),
            created_at=agent_info.created_at,
            created_at=agent_info.created_at.isoformat(),
            policy=self.policy,
            telemetry_enabled=self.telemetry_enabled,
        )
@@ -163,9 +164,9 @@ class MetaReferenceAgentsImpl(Agents):
        agent_id: str,
        session_id: str,
        messages: list[UserMessage | ToolResponseMessage],
        toolgroups: list[AgentToolGroup] | None = None,
        documents: list[Document] | None = None,
        stream: bool | None = False,
        documents: list[Document] | None = None,
        toolgroups: list[AgentToolGroup] | None = None,
        tool_config: ToolConfig | None = None,
    ) -> AsyncGenerator:
        request = AgentTurnCreateRequest(
@@ -221,6 +222,8 @@ class MetaReferenceAgentsImpl(Agents):
    async def get_agents_turn(self, agent_id: str, session_id: str, turn_id: str) -> Turn:
        agent = await self._get_agent_impl(agent_id)
        turn = await agent.storage.get_session_turn(session_id, turn_id)
        if turn is None:
            raise ValueError(f"Turn {turn_id} not found in session {session_id}")
        return turn

    async def get_agents_step(self, agent_id: str, session_id: str, turn_id: str, step_id: str) -> AgentStepResponse:
@@ -232,13 +235,15 @@ class MetaReferenceAgentsImpl(Agents):

    async def get_agents_session(
        self,
        agent_id: str,
        session_id: str,
        agent_id: str,
        turn_ids: list[str] | None = None,
    ) -> Session:
        agent = await self._get_agent_impl(agent_id)

        session_info = await agent.storage.get_session_info(session_id)
        if session_info is None:
            raise ValueError(f"Session {session_id} not found")
        turns = await agent.storage.get_session_turns(session_id)
        if turn_ids:
            turns = [turn for turn in turns if turn.turn_id in turn_ids]
@@ -249,7 +254,7 @@ class MetaReferenceAgentsImpl(Agents):
            started_at=session_info.started_at,
        )

    async def delete_agents_session(self, agent_id: str, session_id: str) -> None:
    async def delete_agents_session(self, session_id: str, agent_id: str) -> None:
        agent = await self._get_agent_impl(agent_id)

        # Delete turns first, then the session
@@ -302,7 +307,7 @@ class MetaReferenceAgentsImpl(Agents):
        agent = Agent(
            agent_id=agent_id,
            agent_config=chat_agent.agent_config,
            created_at=chat_agent.created_at,
            created_at=datetime.fromisoformat(chat_agent.created_at),
        )
        return agent

@@ -323,6 +328,7 @@ class MetaReferenceAgentsImpl(Agents):
        self,
        response_id: str,
    ) -> OpenAIResponseObject:
        assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
        return await self.openai_responses_impl.get_openai_response(response_id)

    async def create_openai_response(
@@ -342,7 +348,8 @@ class MetaReferenceAgentsImpl(Agents):
        max_infer_iters: int | None = 10,
        guardrails: list[ResponseGuardrail] | None = None,
    ) -> OpenAIResponseObject:
        return await self.openai_responses_impl.create_openai_response(
        assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
        result = await self.openai_responses_impl.create_openai_response(
            input,
            model,
            prompt,
@@ -358,6 +365,7 @@ class MetaReferenceAgentsImpl(Agents):
            max_infer_iters,
            guardrails,
        )
        return result  # type: ignore[no-any-return]

    async def list_openai_responses(
        self,
@@ -366,6 +374,7 @@ class MetaReferenceAgentsImpl(Agents):
        model: str | None = None,
        order: Order | None = Order.desc,
    ) -> ListOpenAIResponseObject:
        assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
        return await self.openai_responses_impl.list_openai_responses(after, limit, model, order)

    async def list_openai_response_input_items(
@@ -377,9 +386,11 @@ class MetaReferenceAgentsImpl(Agents):
        limit: int | None = 20,
        order: Order | None = Order.desc,
    ) -> ListOpenAIResponseInputItem:
        assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
        return await self.openai_responses_impl.list_openai_response_input_items(
            response_id, after, before, include, limit, order
        )

    async def delete_openai_response(self, response_id: str) -> None:
    async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
        assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
        return await self.openai_responses_impl.delete_openai_response(response_id)

@@ -6,12 +6,14 @@

import json
import uuid
from dataclasses import dataclass
from datetime import UTC, datetime

from llama_stack.apis.agents import AgentConfig, Session, ToolExecutionStep, Turn
from llama_stack.apis.common.errors import SessionNotFoundError
from llama_stack.core.access_control.access_control import AccessDeniedError, is_action_allowed
from llama_stack.core.access_control.datatypes import AccessRule
from llama_stack.core.access_control.conditions import User as ProtocolUser
from llama_stack.core.access_control.datatypes import AccessRule, Action
from llama_stack.core.datatypes import User
from llama_stack.core.request_headers import get_authenticated_user
from llama_stack.log import get_logger
@@ -33,6 +35,15 @@ class AgentInfo(AgentConfig):
    created_at: datetime


@dataclass
class SessionResource:
    """Concrete implementation of ProtectedResource for session access control."""

    type: str
    identifier: str
    owner: ProtocolUser  # Use the protocol type for structural compatibility


class AgentPersistence:
    def __init__(self, agent_id: str, kvstore: KVStore, policy: list[AccessRule]):
        self.agent_id = agent_id
@@ -53,8 +64,15 @@ class AgentPersistence:
            turns=[],
            identifier=name,  # should this be qualified in any way?
        )
        if not is_action_allowed(self.policy, "create", session_info, user):
            raise AccessDeniedError("create", session_info, user)
        # Only perform access control if we have an authenticated user
        if user is not None and session_info.identifier is not None:
            resource = SessionResource(
                type=session_info.type,
                identifier=session_info.identifier,
                owner=user,
            )
            if not is_action_allowed(self.policy, Action.CREATE, resource, user):
                raise AccessDeniedError(Action.CREATE, resource, user)

        await self.kvstore.set(
            key=f"session:{self.agent_id}:{session_id}",
@@ -62,7 +80,7 @@ class AgentPersistence:
        )
        return session_id

    async def get_session_info(self, session_id: str) -> AgentSessionInfo:
    async def get_session_info(self, session_id: str) -> AgentSessionInfo | None:
        value = await self.kvstore.get(
            key=f"session:{self.agent_id}:{session_id}",
        )
@@ -83,7 +101,22 @@ class AgentPersistence:
        if not hasattr(session_info, "access_attributes") and not hasattr(session_info, "owner"):
            return True

        return is_action_allowed(self.policy, "read", session_info, get_authenticated_user())
        # Get current user - if None, skip access control (e.g., in tests)
        user = get_authenticated_user()
        if user is None:
            return True

        # Access control requires identifier and owner to be set
        if session_info.identifier is None or session_info.owner is None:
            return True

        # At this point, both identifier and owner are guaranteed to be non-None
        resource = SessionResource(
            type=session_info.type,
            identifier=session_info.identifier,
            owner=session_info.owner,
        )
        return is_action_allowed(self.policy, Action.READ, resource, user)

    async def get_session_if_accessible(self, session_id: str) -> AgentSessionInfo | None:
        """Get session info if the user has access to it. For internal use by sub-session methods."""
@@ -91,7 +91,8 @@ class OpenAIResponsesImpl:
        input: str | list[OpenAIResponseInput],
        previous_response: _OpenAIResponseObjectWithInputAndMessages,
    ):
        new_input_items = previous_response.input.copy()
        # Convert Sequence to list for mutation
        new_input_items = list(previous_response.input)
        new_input_items.extend(previous_response.output)

        if isinstance(input, str):

@@ -107,7 +108,7 @@ class OpenAIResponsesImpl:
        tools: list[OpenAIResponseInputTool] | None,
        previous_response_id: str | None,
        conversation: str | None,
    ) -> tuple[str | list[OpenAIResponseInput], list[OpenAIMessageParam]]:
    ) -> tuple[str | list[OpenAIResponseInput], list[OpenAIMessageParam], ToolContext]:
        """Process input with optional previous response context.

        Returns:

@@ -208,6 +209,9 @@ class OpenAIResponsesImpl:
        messages: list[OpenAIMessageParam],
    ) -> None:
        new_input_id = f"msg_{uuid.uuid4()}"
        # Type input_items_data as the full OpenAIResponseInput union to avoid list invariance issues
        input_items_data: list[OpenAIResponseInput] = []

        if isinstance(input, str):
            # synthesize a message from the input string
            input_content = OpenAIResponseInputMessageContentText(text=input)

@@ -219,7 +223,6 @@ class OpenAIResponsesImpl:
            input_items_data = [input_content_item]
        else:
            # we already have a list of messages
            input_items_data = []
            for input_item in input:
                if isinstance(input_item, OpenAIResponseMessage):
                    # These may or may not already have an id, so dump to dict, check for id, and add if missing

@@ -251,7 +254,7 @@ class OpenAIResponsesImpl:
        tools: list[OpenAIResponseInputTool] | None = None,
        include: list[str] | None = None,
        max_infer_iters: int | None = 10,
        guardrails: list[ResponseGuardrailSpec] | None = None,
        guardrails: list[str | ResponseGuardrailSpec] | None = None,
    ):
        stream = bool(stream)
        text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text

@@ -289,16 +292,19 @@ class OpenAIResponsesImpl:
        failed_response = None

        async for stream_chunk in stream_gen:
            if stream_chunk.type in {"response.completed", "response.incomplete"}:
                if final_response is not None:
                    raise ValueError(
                        "The response stream produced multiple terminal responses! "
                        f"Earlier response from {final_event_type}"
                    )
                final_response = stream_chunk.response
                final_event_type = stream_chunk.type
            elif stream_chunk.type == "response.failed":
                failed_response = stream_chunk.response
            match stream_chunk.type:
                case "response.completed" | "response.incomplete":
                    if final_response is not None:
                        raise ValueError(
                            "The response stream produced multiple terminal responses! "
                            f"Earlier response from {final_event_type}"
                        )
                    final_response = stream_chunk.response
                    final_event_type = stream_chunk.type
                case "response.failed":
                    failed_response = stream_chunk.response
                case _:
                    pass  # Other event types don't have .response

        if failed_response is not None:
            error_message = (

@@ -326,6 +332,11 @@ class OpenAIResponsesImpl:
        max_infer_iters: int | None = 10,
        guardrail_ids: list[str] | None = None,
    ) -> AsyncIterator[OpenAIResponseObjectStream]:
        # These should never be None when called from create_openai_response (which sets defaults)
        # but we assert here to help mypy understand the types
        assert text is not None, "text must not be None"
        assert max_infer_iters is not None, "max_infer_iters must not be None"

        # Input preprocessing
        all_input, messages, tool_context = await self._process_input_with_previous_response(
            input, tools, previous_response_id, conversation

@@ -368,16 +379,19 @@ class OpenAIResponsesImpl:
        final_response = None
        failed_response = None

        output_items = []
        # Type as ConversationItem to avoid list invariance issues
        output_items: list[ConversationItem] = []
        async for stream_chunk in orchestrator.create_response():
            if stream_chunk.type in {"response.completed", "response.incomplete"}:
                final_response = stream_chunk.response
            elif stream_chunk.type == "response.failed":
                failed_response = stream_chunk.response

            if stream_chunk.type == "response.output_item.done":
                item = stream_chunk.item
                output_items.append(item)
            match stream_chunk.type:
                case "response.completed" | "response.incomplete":
                    final_response = stream_chunk.response
                case "response.failed":
                    failed_response = stream_chunk.response
                case "response.output_item.done":
                    item = stream_chunk.item
                    output_items.append(item)
                case _:
                    pass  # Other event types

        # Store and sync before yielding terminal events
        # This ensures the storage/syncing happens even if the consumer breaks after receiving the event

@@ -410,7 +424,8 @@ class OpenAIResponsesImpl:
        self, conversation_id: str, input: str | list[OpenAIResponseInput] | None, output_items: list[ConversationItem]
    ) -> None:
        """Sync content and response messages to the conversation."""
        conversation_items = []
        # Type as ConversationItem union to avoid list invariance issues
        conversation_items: list[ConversationItem] = []

        if isinstance(input, str):
            conversation_items.append(
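The `match` refactor above makes the terminal-event scan exhaustive while keeping mypy happy about union members that lack a `.response` attribute. A self-contained sketch of the same pattern over a toy async stream (the event names mirror the diff; the `Event` dataclass is a stand-in, not the stack's chunk type):

import asyncio
from dataclasses import dataclass


@dataclass
class Event:
    type: str
    payload: str | None = None


async def toy_stream():
    for e in [
        Event("response.created"),
        Event("response.output_item.done", "item"),
        Event("response.completed", "final"),
    ]:
        yield e


async def scan() -> str | None:
    final, failed = None, None
    async for chunk in toy_stream():
        match chunk.type:
            case "response.completed" | "response.incomplete":
                if final is not None:
                    raise ValueError("multiple terminal responses")
                final = chunk.payload
            case "response.failed":
                failed = chunk.payload
            case _:
                pass  # non-terminal events carry no final payload
    return failed or final


print(asyncio.run(scan()))  # -> "final"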
@@ -111,7 +111,7 @@ class StreamingResponseOrchestrator:
        text: OpenAIResponseText,
        max_infer_iters: int,
        tool_executor,  # Will be the tool execution logic from the main class
        instructions: str,
        instructions: str | None,
        safety_api,
        guardrail_ids: list[str] | None = None,
        prompt: OpenAIResponsePrompt | None = None,

@@ -128,7 +128,9 @@ class StreamingResponseOrchestrator:
        self.prompt = prompt
        self.sequence_number = 0
        # Store MCP tool mapping that gets built during tool processing
        self.mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] = ctx.tool_context.previous_tools or {}
        self.mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] = (
            ctx.tool_context.previous_tools if ctx.tool_context else {}
        )
        # Track final messages after all tool executions
        self.final_messages: list[OpenAIMessageParam] = []
        # mapping for annotations

@@ -229,7 +231,8 @@ class StreamingResponseOrchestrator:
        params = OpenAIChatCompletionRequestWithExtraBody(
            model=self.ctx.model,
            messages=messages,
            tools=self.ctx.chat_tools,
            # Pydantic models are dict-compatible but mypy treats them as distinct types
            tools=self.ctx.chat_tools,  # type: ignore[arg-type]
            stream=True,
            temperature=self.ctx.temperature,
            response_format=response_format,

@@ -272,7 +275,12 @@ class StreamingResponseOrchestrator:

        # Handle choices with no tool calls
        for choice in current_response.choices:
            if not (choice.message.tool_calls and self.ctx.response_tools):
            has_tool_calls = (
                isinstance(choice.message, OpenAIAssistantMessageParam)
                and choice.message.tool_calls
                and self.ctx.response_tools
            )
            if not has_tool_calls:
                output_messages.append(
                    await convert_chat_choice_to_response_message(
                        choice,

@@ -722,7 +730,10 @@ class StreamingResponseOrchestrator:
                )

            # Accumulate arguments for final response (only for subsequent chunks)
            if not is_new_tool_call:
            if not is_new_tool_call and response_tool_call is not None:
                # Both should have functions since we're inside the tool_call.function check above
                assert response_tool_call.function is not None
                assert tool_call.function is not None
                response_tool_call.function.arguments = (
                    response_tool_call.function.arguments or ""
                ) + tool_call.function.arguments

@@ -747,10 +758,13 @@ class StreamingResponseOrchestrator:
        for tool_call_index in sorted(chat_response_tool_calls.keys()):
            tool_call = chat_response_tool_calls[tool_call_index]
            # Ensure that arguments, if sent back to the inference provider, are not None
            tool_call.function.arguments = tool_call.function.arguments or "{}"
            if tool_call.function:
                tool_call.function.arguments = tool_call.function.arguments or "{}"
            tool_call_item_id = tool_call_item_ids[tool_call_index]
            final_arguments = tool_call.function.arguments
            tool_call_name = chat_response_tool_calls[tool_call_index].function.name
            final_arguments: str = tool_call.function.arguments or "{}" if tool_call.function else "{}"
            func = chat_response_tool_calls[tool_call_index].function

            tool_call_name = func.name if func else ""

            # Check if this is an MCP tool call
            is_mcp_tool = tool_call_name and tool_call_name in self.mcp_tool_to_server

@@ -894,12 +908,11 @@ class StreamingResponseOrchestrator:

        self.sequence_number += 1
        if tool_call.function.name and tool_call.function.name in self.mcp_tool_to_server:
            item = OpenAIResponseOutputMessageMCPCall(
            item: OpenAIResponseOutput = OpenAIResponseOutputMessageMCPCall(
                arguments="",
                name=tool_call.function.name,
                id=matching_item_id,
                server_label=self.mcp_tool_to_server[tool_call.function.name].server_label,
                status="in_progress",
            )
        elif tool_call.function.name == "web_search":
            item = OpenAIResponseOutputMessageWebSearchToolCall(

@@ -1008,7 +1021,7 @@ class StreamingResponseOrchestrator:
                description=tool.description,
                input_schema=tool.input_schema,
            )
            return convert_tooldef_to_openai_tool(tool_def)
            return convert_tooldef_to_openai_tool(tool_def)  # type: ignore[return-value] # Returns dict but ChatCompletionToolParam expects TypedDict

        # Initialize chat_tools if not already set
        if self.ctx.chat_tools is None:

@@ -1016,7 +1029,7 @@ class StreamingResponseOrchestrator:

        for input_tool in tools:
            if input_tool.type == "function":
                self.ctx.chat_tools.append(ChatCompletionToolParam(type="function", function=input_tool.model_dump()))
                self.ctx.chat_tools.append(ChatCompletionToolParam(type="function", function=input_tool.model_dump()))  # type: ignore[typeddict-item,arg-type] # Dict compatible with FunctionDefinition
            elif input_tool.type in WebSearchToolTypes:
                tool_name = "web_search"
                # Need to access tool_groups_api from tool_executor

@@ -1055,8 +1068,8 @@ class StreamingResponseOrchestrator:
            if isinstance(mcp_tool.allowed_tools, list):
                always_allowed = mcp_tool.allowed_tools
            elif isinstance(mcp_tool.allowed_tools, AllowedToolsFilter):
                always_allowed = mcp_tool.allowed_tools.always
                never_allowed = mcp_tool.allowed_tools.never
                # AllowedToolsFilter only has tool_names field (not allowed/disallowed)
                always_allowed = mcp_tool.allowed_tools.tool_names

            # Call list_mcp_tools
            tool_defs = None

@@ -1088,7 +1101,7 @@ class StreamingResponseOrchestrator:
                openai_tool = convert_tooldef_to_chat_tool(t)
                if self.ctx.chat_tools is None:
                    self.ctx.chat_tools = []
                self.ctx.chat_tools.append(openai_tool)
                self.ctx.chat_tools.append(openai_tool)  # type: ignore[arg-type] # Returns dict but ChatCompletionToolParam expects TypedDict

                # Add to MCP tool mapping
                if t.name in self.mcp_tool_to_server:

@@ -1120,13 +1133,17 @@ class StreamingResponseOrchestrator:
        self, output_messages: list[OpenAIResponseOutput]
    ) -> AsyncIterator[OpenAIResponseObjectStream]:
        # Handle all mcp tool lists from previous response that are still valid:
        for tool in self.ctx.tool_context.previous_tool_listings:
            async for evt in self._reuse_mcp_list_tools(tool, output_messages):
                yield evt
        # Process all remaining tools (including MCP tools) and emit streaming events
        if self.ctx.tool_context.tools_to_process:
            async for stream_event in self._process_new_tools(self.ctx.tool_context.tools_to_process, output_messages):
                yield stream_event
        # tool_context can be None when no tools are provided in the response request
        if self.ctx.tool_context:
            for tool in self.ctx.tool_context.previous_tool_listings:
                async for evt in self._reuse_mcp_list_tools(tool, output_messages):
                    yield evt
            # Process all remaining tools (including MCP tools) and emit streaming events
            if self.ctx.tool_context.tools_to_process:
                async for stream_event in self._process_new_tools(
                    self.ctx.tool_context.tools_to_process, output_messages
                ):
                    yield stream_event

    def _approval_required(self, tool_name: str) -> bool:
        if tool_name not in self.mcp_tool_to_server:

@@ -1220,7 +1237,7 @@ class StreamingResponseOrchestrator:
            openai_tool = convert_tooldef_to_openai_tool(tool_def)
            if self.ctx.chat_tools is None:
                self.ctx.chat_tools = []
            self.ctx.chat_tools.append(openai_tool)
            self.ctx.chat_tools.append(openai_tool)  # type: ignore[arg-type] # Returns dict but ChatCompletionToolParam expects TypedDict

        mcp_list_message = OpenAIResponseOutputMessageMCPListTools(
            id=f"mcp_list_{uuid.uuid4()}",
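The guarded accumulation above (`if not is_new_tool_call and response_tool_call is not None`) is the usual way to stitch streamed tool-call argument deltas back into a complete JSON string. A toy sketch of that accumulation, with plain dataclasses standing in for the OpenAI chunk types:

from dataclasses import dataclass


@dataclass
class FunctionDelta:
    name: str | None = None
    arguments: str = ""


@dataclass
class ToolCallDelta:
    index: int
    function: FunctionDelta | None = None


def accumulate(deltas: list[ToolCallDelta]) -> dict[int, FunctionDelta]:
    calls: dict[int, FunctionDelta] = {}
    for delta in deltas:
        if delta.function is None:
            continue
        is_new = delta.index not in calls
        if is_new:
            # First chunk for this index carries the function name
            calls[delta.index] = FunctionDelta(name=delta.function.name)
        existing = calls.get(delta.index)
        # Append arguments only for subsequent chunks, and only when the slot exists
        if not is_new and existing is not None:
            existing.arguments = (existing.arguments or "") + delta.function.arguments
    return calls


chunks = [
    ToolCallDelta(0, FunctionDelta(name="get_weather")),
    ToolCallDelta(0, FunctionDelta(arguments='{"city": ')),
    ToolCallDelta(0, FunctionDelta(arguments='"Paris"}')),
]
print(accumulate(chunks)[0].arguments)  # {"city": "Paris"}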
@@ -7,6 +7,7 @@
import asyncio
import json
from collections.abc import AsyncIterator
from typing import Any

from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseInputToolFileSearch,

@@ -22,6 +23,7 @@ from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseObjectStreamResponseWebSearchCallSearching,
    OpenAIResponseOutputMessageFileSearchToolCall,
    OpenAIResponseOutputMessageFileSearchToolCallResults,
    OpenAIResponseOutputMessageMCPCall,
    OpenAIResponseOutputMessageWebSearchToolCall,
)
from llama_stack.apis.common.content_types import (

@@ -67,7 +69,7 @@ class ToolExecutor:
    ) -> AsyncIterator[ToolExecutionResult]:
        tool_call_id = tool_call.id
        function = tool_call.function
        tool_kwargs = json.loads(function.arguments) if function.arguments else {}
        tool_kwargs = json.loads(function.arguments) if function and function.arguments else {}

        if not function or not tool_call_id or not function.name:
            yield ToolExecutionResult(sequence_number=sequence_number)

@@ -84,7 +86,16 @@ class ToolExecutor:
        error_exc, result = await self._execute_tool(function.name, tool_kwargs, ctx, mcp_tool_to_server)

        # Emit completion events for tool execution
        has_error = error_exc or (result and ((result.error_code and result.error_code > 0) or result.error_message))
        has_error = bool(
            error_exc
            or (
                result
                and (
                    ((error_code := getattr(result, "error_code", None)) and error_code > 0)
                    or getattr(result, "error_message", None)
                )
            )
        )
        async for event_result in self._emit_completion_events(
            function.name, ctx, sequence_number, output_index, item_id, has_error, mcp_tool_to_server
        ):

@@ -101,7 +112,9 @@ class ToolExecutor:
            sequence_number=sequence_number,
            final_output_message=output_message,
            final_input_message=input_message,
            citation_files=result.metadata.get("citation_files") if result and result.metadata else None,
            citation_files=(
                metadata.get("citation_files") if result and (metadata := getattr(result, "metadata", None)) else None
            ),
        )

    async def _execute_knowledge_search_via_vector_store(

@@ -188,8 +201,9 @@ class ToolExecutor:

            citation_files[file_id] = filename

        # Cast to proper InterleavedContent type (list invariance)
        return ToolInvocationResult(
            content=content_items,
            content=content_items,  # type: ignore[arg-type]
            metadata={
                "document_ids": [r.file_id for r in search_results],
                "chunks": [r.content[0].text if r.content else "" for r in search_results],

@@ -209,51 +223,60 @@ class ToolExecutor:
    ) -> AsyncIterator[ToolExecutionResult]:
        """Emit progress events for tool execution start."""
        # Emit in_progress event based on tool type (only for tools with specific streaming events)
        progress_event = None
        if mcp_tool_to_server and function_name in mcp_tool_to_server:
            sequence_number += 1
            progress_event = OpenAIResponseObjectStreamResponseMcpCallInProgress(
                item_id=item_id,
                output_index=output_index,
            yield ToolExecutionResult(
                stream_event=OpenAIResponseObjectStreamResponseMcpCallInProgress(
                    item_id=item_id,
                    output_index=output_index,
                    sequence_number=sequence_number,
                ),
                sequence_number=sequence_number,
            )
        elif function_name == "web_search":
            sequence_number += 1
            progress_event = OpenAIResponseObjectStreamResponseWebSearchCallInProgress(
                item_id=item_id,
                output_index=output_index,
            yield ToolExecutionResult(
                stream_event=OpenAIResponseObjectStreamResponseWebSearchCallInProgress(
                    item_id=item_id,
                    output_index=output_index,
                    sequence_number=sequence_number,
                ),
                sequence_number=sequence_number,
            )
        elif function_name == "knowledge_search":
            sequence_number += 1
            progress_event = OpenAIResponseObjectStreamResponseFileSearchCallInProgress(
                item_id=item_id,
                output_index=output_index,
            yield ToolExecutionResult(
                stream_event=OpenAIResponseObjectStreamResponseFileSearchCallInProgress(
                    item_id=item_id,
                    output_index=output_index,
                    sequence_number=sequence_number,
                ),
                sequence_number=sequence_number,
            )

        if progress_event:
            yield ToolExecutionResult(stream_event=progress_event, sequence_number=sequence_number)

        # For web search, emit searching event
        if function_name == "web_search":
            sequence_number += 1
            searching_event = OpenAIResponseObjectStreamResponseWebSearchCallSearching(
                item_id=item_id,
                output_index=output_index,
            yield ToolExecutionResult(
                stream_event=OpenAIResponseObjectStreamResponseWebSearchCallSearching(
                    item_id=item_id,
                    output_index=output_index,
                    sequence_number=sequence_number,
                ),
                sequence_number=sequence_number,
            )
            yield ToolExecutionResult(stream_event=searching_event, sequence_number=sequence_number)

        # For file search, emit searching event
        if function_name == "knowledge_search":
            sequence_number += 1
            searching_event = OpenAIResponseObjectStreamResponseFileSearchCallSearching(
                item_id=item_id,
                output_index=output_index,
            yield ToolExecutionResult(
                stream_event=OpenAIResponseObjectStreamResponseFileSearchCallSearching(
                    item_id=item_id,
                    output_index=output_index,
                    sequence_number=sequence_number,
                ),
                sequence_number=sequence_number,
            )
            yield ToolExecutionResult(stream_event=searching_event, sequence_number=sequence_number)

    async def _execute_tool(
        self,

@@ -261,7 +284,7 @@ class ToolExecutor:
        tool_kwargs: dict,
        ctx: ChatCompletionContext,
        mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] | None = None,
    ) -> tuple[Exception | None, any]:
    ) -> tuple[Exception | None, Any]:
        """Execute the tool and return error exception and result."""
        error_exc = None
        result = None

@@ -284,9 +307,13 @@ class ToolExecutor:
                kwargs=tool_kwargs,
            )
        elif function_name == "knowledge_search":
            response_file_search_tool = next(
                (t for t in ctx.response_tools if isinstance(t, OpenAIResponseInputToolFileSearch)),
                None,
            response_file_search_tool = (
                next(
                    (t for t in ctx.response_tools if isinstance(t, OpenAIResponseInputToolFileSearch)),
                    None,
                )
                if ctx.response_tools
                else None
            )
            if response_file_search_tool:
                # Use vector_stores.search API instead of knowledge_search tool

@@ -322,35 +349,34 @@ class ToolExecutor:
        mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] | None = None,
    ) -> AsyncIterator[ToolExecutionResult]:
        """Emit completion or failure events for tool execution."""
        completion_event = None

        if mcp_tool_to_server and function_name in mcp_tool_to_server:
            sequence_number += 1
            if has_error:
                completion_event = OpenAIResponseObjectStreamResponseMcpCallFailed(
                mcp_failed_event = OpenAIResponseObjectStreamResponseMcpCallFailed(
                    sequence_number=sequence_number,
                )
                yield ToolExecutionResult(stream_event=mcp_failed_event, sequence_number=sequence_number)
            else:
                completion_event = OpenAIResponseObjectStreamResponseMcpCallCompleted(
                mcp_completed_event = OpenAIResponseObjectStreamResponseMcpCallCompleted(
                    sequence_number=sequence_number,
                )
                yield ToolExecutionResult(stream_event=mcp_completed_event, sequence_number=sequence_number)
        elif function_name == "web_search":
            sequence_number += 1
            completion_event = OpenAIResponseObjectStreamResponseWebSearchCallCompleted(
            web_completion_event = OpenAIResponseObjectStreamResponseWebSearchCallCompleted(
                item_id=item_id,
                output_index=output_index,
                sequence_number=sequence_number,
            )
            yield ToolExecutionResult(stream_event=web_completion_event, sequence_number=sequence_number)
        elif function_name == "knowledge_search":
            sequence_number += 1
            completion_event = OpenAIResponseObjectStreamResponseFileSearchCallCompleted(
            file_completion_event = OpenAIResponseObjectStreamResponseFileSearchCallCompleted(
                item_id=item_id,
                output_index=output_index,
                sequence_number=sequence_number,
            )

        if completion_event:
            yield ToolExecutionResult(stream_event=completion_event, sequence_number=sequence_number)
            yield ToolExecutionResult(stream_event=file_completion_event, sequence_number=sequence_number)

    async def _build_result_messages(
        self,

@@ -360,21 +386,18 @@ class ToolExecutor:
        tool_kwargs: dict,
        ctx: ChatCompletionContext,
        error_exc: Exception | None,
        result: any,
        result: Any,
        has_error: bool,
        mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] | None = None,
    ) -> tuple[any, any]:
    ) -> tuple[Any, Any]:
        """Build output and input messages from tool execution results."""
        from llama_stack.providers.utils.inference.prompt_adapter import (
            interleaved_content_as_str,
        )

        # Build output message
        message: Any
        if mcp_tool_to_server and function.name in mcp_tool_to_server:
            from llama_stack.apis.agents.openai_responses import (
                OpenAIResponseOutputMessageMCPCall,
            )

            message = OpenAIResponseOutputMessageMCPCall(
                id=item_id,
                arguments=function.arguments,

@@ -383,10 +406,14 @@ class ToolExecutor:
            )
            if error_exc:
                message.error = str(error_exc)
            elif (result and result.error_code and result.error_code > 0) or (result and result.error_message):
                message.error = f"Error (code {result.error_code}): {result.error_message}"
            elif result and result.content:
                message.output = interleaved_content_as_str(result.content)
            elif (result and (error_code := getattr(result, "error_code", None)) and error_code > 0) or (
                result and getattr(result, "error_message", None)
            ):
                ec = getattr(result, "error_code", "unknown")
                em = getattr(result, "error_message", "")
                message.error = f"Error (code {ec}): {em}"
            elif result and (content := getattr(result, "content", None)):
                message.output = interleaved_content_as_str(content)
        else:
            if function.name == "web_search":
                message = OpenAIResponseOutputMessageWebSearchToolCall(

@@ -401,17 +428,17 @@ class ToolExecutor:
                    queries=[tool_kwargs.get("query", "")],
                    status="completed",
                )
                if result and "document_ids" in result.metadata:
                if result and (metadata := getattr(result, "metadata", None)) and "document_ids" in metadata:
                    message.results = []
                    for i, doc_id in enumerate(result.metadata["document_ids"]):
                        text = result.metadata["chunks"][i] if "chunks" in result.metadata else None
                        score = result.metadata["scores"][i] if "scores" in result.metadata else None
                    for i, doc_id in enumerate(metadata["document_ids"]):
                        text = metadata["chunks"][i] if "chunks" in metadata else None
                        score = metadata["scores"][i] if "scores" in metadata else None
                        message.results.append(
                            OpenAIResponseOutputMessageFileSearchToolCallResults(
                                file_id=doc_id,
                                filename=doc_id,
                                text=text,
                                score=score,
                                text=text if text is not None else "",
                                score=score if score is not None else 0.0,
                                attributes={},
                            )
                        )

@@ -421,27 +448,32 @@ class ToolExecutor:
                raise ValueError(f"Unknown tool {function.name} called")

        # Build input message
        input_message = None
        if result and result.content:
            if isinstance(result.content, str):
                content = result.content
            elif isinstance(result.content, list):
                content = []
                for item in result.content:
        input_message: OpenAIToolMessageParam | None = None
        if result and (result_content := getattr(result, "content", None)):
            # all the mypy contortions here are still unsatisfactory with random Any typing
            if isinstance(result_content, str):
                msg_content: str | list[Any] = result_content
            elif isinstance(result_content, list):
                content_list: list[Any] = []
                for item in result_content:
                    part: Any
                    if isinstance(item, TextContentItem):
                        part = OpenAIChatCompletionContentPartTextParam(text=item.text)
                    elif isinstance(item, ImageContentItem):
                        if item.image.data:
                            url = f"data:image;base64,{item.image.data}"
                            url_value = f"data:image;base64,{item.image.data}"
                        else:
                            url = item.image.url
                            part = OpenAIChatCompletionContentPartImageParam(image_url=OpenAIImageURL(url=url))
                            url_value = str(item.image.url) if item.image.url else ""
                        part = OpenAIChatCompletionContentPartImageParam(image_url=OpenAIImageURL(url=url_value))
                    else:
                        raise ValueError(f"Unknown result content type: {type(item)}")
                    content.append(part)
                    content_list.append(part)
                msg_content = content_list
            else:
                raise ValueError(f"Unknown result content type: {type(result.content)}")
            input_message = OpenAIToolMessageParam(content=content, tool_call_id=tool_call_id)
                raise ValueError(f"Unknown result content type: {type(result_content)}")
            # OpenAIToolMessageParam accepts str | list[TextParam] but we may have images
            # This is runtime-safe as the API accepts it, but mypy complains
            input_message = OpenAIToolMessageParam(content=msg_content, tool_call_id=tool_call_id)  # type: ignore[arg-type]
        else:
            text = str(error_exc) if error_exc else "Tool execution failed"
            input_message = OpenAIToolMessageParam(content=text, tool_call_id=tool_call_id)
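The `has_error` rewrite above combines `getattr` with the walrus operator so a loosely typed (`Any`) tool result can be probed without raising AttributeError or tripping mypy. A self-contained sketch of the pattern (the result classes are illustrative stand-ins, not the stack's types):

from dataclasses import dataclass


@dataclass
class RichResult:
    error_code: int | None = None
    error_message: str | None = None


class BareResult:
    """A result object that exposes no error fields at all."""


def has_error(error_exc: Exception | None, result: object | None) -> bool:
    # getattr(..., None) tolerates objects missing the attribute entirely;
    # the walrus binding lets us test and reuse the value in one expression
    return bool(
        error_exc
        or (
            result
            and (
                ((error_code := getattr(result, "error_code", None)) and error_code > 0)
                or getattr(result, "error_message", None)
            )
        )
    )


assert has_error(None, RichResult(error_code=3))
assert has_error(None, RichResult(error_message="boom"))
assert not has_error(None, BareResult())
assert not has_error(None, None)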
@@ -5,6 +5,7 @@
# the root directory of this source tree.

from dataclasses import dataclass
from typing import cast

from openai.types.chat import ChatCompletionToolParam
from pydantic import BaseModel

@@ -100,17 +101,19 @@ class ToolContext(BaseModel):
            if isinstance(tool, OpenAIResponseToolMCP):
                previous_tools_by_label[tool.server_label] = tool
        # collect tool definitions which are the same in current and previous requests:
        tools_to_process = []
        tools_to_process: list[OpenAIResponseInputTool] = []
        matched: dict[str, OpenAIResponseInputToolMCP] = {}
        for tool in self.current_tools:
        # Mypy confuses OpenAIResponseInputTool (Input union) with OpenAIResponseTool (output union)
        # which differ only in MCP type (InputToolMCP vs ToolMCP). Code is correct.
        for tool in cast(list[OpenAIResponseInputTool], self.current_tools):  # type: ignore[assignment]
            if isinstance(tool, OpenAIResponseInputToolMCP) and tool.server_label in previous_tools_by_label:
                previous_tool = previous_tools_by_label[tool.server_label]
                if previous_tool.allowed_tools == tool.allowed_tools:
                    matched[tool.server_label] = tool
                else:
                    tools_to_process.append(tool)
                    tools_to_process.append(tool)  # type: ignore[arg-type]
            else:
                tools_to_process.append(tool)
                tools_to_process.append(tool)  # type: ignore[arg-type]
        # tools that are not the same or were not previously defined need to be processed:
        self.tools_to_process = tools_to_process
        # for all matched definitions, get the mcp_list_tools objects from the previous output:

@@ -119,9 +122,11 @@ class ToolContext(BaseModel):
        ]
        # reconstruct the tool to server mappings that can be reused:
        for listing in self.previous_tool_listings:
            # listing is OpenAIResponseOutputMessageMCPListTools which has tools: list[MCPListToolsTool]
            definition = matched[listing.server_label]
            for tool in listing.tools:
                self.previous_tools[tool.name] = definition
            for mcp_tool in listing.tools:
                # mcp_tool is MCPListToolsTool which has a name: str field
                self.previous_tools[mcp_tool.name] = definition

    def available_tools(self) -> list[OpenAIResponseTool]:
        if not self.current_tools:

@@ -139,6 +144,8 @@ class ToolContext(BaseModel):
                    server_label=tool.server_label,
                    allowed_tools=tool.allowed_tools,
                )
            # Exhaustive check - all tool types should be handled above
            raise AssertionError(f"Unexpected tool type: {type(tool)}")

        return [convert_tool(tool) for tool in self.current_tools]
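The matching logic above lets a new request reuse MCP list_tools results from a previous response whenever the server label and `allowed_tools` filter are unchanged. A toy sketch of that cache-key idea, using plain dicts in place of the Pydantic models:

def partition_tools(current: list[dict], previous: list[dict]) -> tuple[dict[str, dict], list[dict]]:
    """Return (reusable tools keyed by server_label, tools that must be re-listed)."""
    previous_by_label = {t["server_label"]: t for t in previous if t.get("type") == "mcp"}
    matched: dict[str, dict] = {}
    to_process: list[dict] = []
    for tool in current:
        prev = previous_by_label.get(tool.get("server_label", ""))
        # Reuse only when the allowed_tools filter is identical; otherwise re-list
        if prev is not None and prev.get("allowed_tools") == tool.get("allowed_tools"):
            matched[tool["server_label"]] = tool
        else:
            to_process.append(tool)
    return matched, to_process


prev = [{"type": "mcp", "server_label": "fs", "allowed_tools": ["read"]}]
cur = [
    {"type": "mcp", "server_label": "fs", "allowed_tools": ["read"]},  # reusable
    {"type": "mcp", "server_label": "fs2", "allowed_tools": None},     # new server
]
matched, todo = partition_tools(cur, prev)
assert "fs" in matched and len(todo) == 1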
@@ -7,6 +7,7 @@
import asyncio
import re
import uuid
from collections.abc import Sequence

from llama_stack.apis.agents.agents import ResponseGuardrailSpec
from llama_stack.apis.agents.openai_responses import (

@@ -71,14 +72,14 @@ async def convert_chat_choice_to_response_message(

    return OpenAIResponseMessage(
        id=message_id or f"msg_{uuid.uuid4()}",
        content=[OpenAIResponseOutputMessageContentOutputText(text=clean_text, annotations=annotations)],
        content=[OpenAIResponseOutputMessageContentOutputText(text=clean_text, annotations=list(annotations))],
        status="completed",
        role="assistant",
    )


async def convert_response_content_to_chat_content(
    content: (str | list[OpenAIResponseInputMessageContent] | list[OpenAIResponseOutputMessageContent]),
    content: str | Sequence[OpenAIResponseInputMessageContent | OpenAIResponseOutputMessageContent],
) -> str | list[OpenAIChatCompletionContentPartParam]:
    """
    Convert the content parts from an OpenAI Response API request into OpenAI Chat Completion content parts.

@@ -88,7 +89,8 @@ async def convert_response_content_to_chat_content(
    if isinstance(content, str):
        return content

    converted_parts = []
    # Type with union to avoid list invariance issues
    converted_parts: list[OpenAIChatCompletionContentPartParam] = []
    for content_part in content:
        if isinstance(content_part, OpenAIResponseInputMessageContentText):
            converted_parts.append(OpenAIChatCompletionContentPartTextParam(text=content_part.text))

@@ -158,9 +160,11 @@ async def convert_response_input_to_chat_messages(
                ),
            )
            messages.append(OpenAIAssistantMessageParam(tool_calls=[tool_call]))
            # Output can be None, use empty string as fallback
            output_content = input_item.output if input_item.output is not None else ""
            messages.append(
                OpenAIToolMessageParam(
                    content=input_item.output,
                    content=output_content,
                    tool_call_id=input_item.id,
                )
            )

@@ -172,7 +176,8 @@ async def convert_response_input_to_chat_messages(
        ):
            # these are handled by the responses impl itself and not pass through to chat completions
            pass
        else:
        elif isinstance(input_item, OpenAIResponseMessage):
            # Narrow type to OpenAIResponseMessage which has content and role attributes
            content = await convert_response_content_to_chat_content(input_item.content)
            message_type = await get_message_type_by_role(input_item.role)
            if message_type is None:

@@ -191,7 +196,8 @@ async def convert_response_input_to_chat_messages(
                    last_user_content = getattr(last_user_msg, "content", None)
                    if last_user_content == content:
                        continue  # Skip duplicate user message
            messages.append(message_type(content=content))
            # Dynamic message type call - different message types have different content expectations
            messages.append(message_type(content=content))  # type: ignore[call-arg,arg-type]
    if len(tool_call_results):
        # Check if unpaired function_call_outputs reference function_calls from previous messages
        if previous_messages:

@@ -237,8 +243,11 @@ async def convert_response_text_to_chat_response_format(
    if text.format["type"] == "json_object":
        return OpenAIResponseFormatJSONObject()
    if text.format["type"] == "json_schema":
        # Assert name exists for json_schema format
        assert text.format.get("name"), "json_schema format requires a name"
        schema_name: str = text.format["name"]  # type: ignore[assignment]
        return OpenAIResponseFormatJSONSchema(
            json_schema=OpenAIJSONSchema(name=text.format["name"], schema=text.format["schema"])
            json_schema=OpenAIJSONSchema(name=schema_name, schema=text.format["schema"])
        )
    raise ValueError(f"Unsupported text format: {text.format}")

@@ -251,7 +260,7 @@ async def get_message_type_by_role(role: str) -> type[OpenAIMessageParam] | None
        "assistant": OpenAIAssistantMessageParam,
        "developer": OpenAIDeveloperMessageParam,
    }
    return role_to_type.get(role)
    return role_to_type.get(role)  # type: ignore[return-value] # Pydantic models use ModelMetaclass


def _extract_citations_from_text(

@@ -320,7 +329,8 @@ async def run_guardrails(safety_api: Safety, messages: str, guardrail_ids: list[

    # Look up shields to get their provider_resource_id (actual model ID)
    model_ids = []
    shields_list = await safety_api.routing_table.list_shields()
    # TODO: list_shields not in Safety interface but available at runtime via API routing
    shields_list = await safety_api.routing_table.list_shields()  # type: ignore[attr-defined]

    for guardrail_id in guardrail_ids:
        matching_shields = [shield for shield in shields_list.data if shield.identifier == guardrail_id]

@@ -337,7 +347,9 @@ async def run_guardrails(safety_api: Safety, messages: str, guardrail_ids: list[
    for result in response.results:
        if result.flagged:
            message = result.user_message or "Content blocked by safety guardrails"
            flagged_categories = [cat for cat, flagged in result.categories.items() if flagged]
            flagged_categories = (
                [cat for cat, flagged in result.categories.items() if flagged] if result.categories else []
            )
            violation_type = result.metadata.get("violation_type", []) if result.metadata else []

            if flagged_categories:

@@ -347,6 +359,9 @@ async def run_guardrails(safety_api: Safety, messages: str, guardrail_ids: list[

            return message

    # No violations found
    return None


def extract_guardrail_ids(guardrails: list | None) -> list[str]:
    """Extract guardrail IDs from guardrails parameter, handling both string IDs and ResponseGuardrailSpec objects."""
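The guardrail changes above repeatedly guard optional fields (`result.categories`, `result.metadata`) before iterating them. A small sketch of that defensive extraction, with a simplified stand-in for the moderation result shape:

from dataclasses import dataclass


@dataclass
class ModerationResult:
    flagged: bool
    user_message: str | None = None
    categories: dict[str, bool] | None = None
    metadata: dict | None = None


def first_violation(results: list[ModerationResult]) -> str | None:
    for result in results:
        if not result.flagged:
            continue
        message = result.user_message or "Content blocked by safety guardrails"
        # Optional fields fall back to empty containers instead of raising on None
        flagged_categories = (
            [cat for cat, flagged in result.categories.items() if flagged] if result.categories else []
        )
        violation_type = result.metadata.get("violation_type", []) if result.metadata else []
        if flagged_categories:
            message += f" (flagged: {', '.join(flagged_categories)})"
        if violation_type:
            message += f" (violation: {violation_type})"
        return message
    return None  # no violations found


print(first_violation([ModerationResult(flagged=True, categories={"hate": True})]))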
@@ -6,7 +6,7 @@

import asyncio

from llama_stack.apis.inference import Message
from llama_stack.apis.inference import OpenAIMessageParam
from llama_stack.apis.safety import Safety, SafetyViolation, ViolationLevel
from llama_stack.core.telemetry import tracing
from llama_stack.log import get_logger

@@ -31,7 +31,7 @@ class ShieldRunnerMixin:
        self.input_shields = input_shields
        self.output_shields = output_shields

    async def run_multiple_shields(self, messages: list[Message], identifiers: list[str]) -> None:
    async def run_multiple_shields(self, messages: list[OpenAIMessageParam], identifiers: list[str]) -> None:
        async def run_shield_with_span(identifier: str):
            async with tracing.span(f"run_shield_{identifier}"):
                return await self.safety_api.run_shield(
@@ -33,4 +33,5 @@ class AnthropicInferenceAdapter(OpenAIMixin):
        return "https://api.anthropic.com/v1"

    async def list_provider_model_ids(self) -> Iterable[str]:
        return [m.id async for m in AsyncAnthropic(api_key=self.get_api_key()).models.list()]
        api_key = self._get_api_key_from_config_or_provider_data()
        return [m.id async for m in AsyncAnthropic(api_key=api_key).models.list()]
@@ -33,10 +33,11 @@ class DatabricksInferenceAdapter(OpenAIMixin):

    async def list_provider_model_ids(self) -> Iterable[str]:
        # Filter out None values from endpoint names
        api_token = self._get_api_key_from_config_or_provider_data()
        return [
            endpoint.name  # type: ignore[misc]
            for endpoint in WorkspaceClient(
                host=self.config.url, token=self.get_api_key()
                host=self.config.url, token=api_token
            ).serving_endpoints.list()  # TODO: this is not async
        ]
@@ -128,7 +128,9 @@ class LiteLLMOpenAIMixin(
        return schema

    async def _get_params(self, request: ChatCompletionRequest) -> dict:
        input_dict = {}
        from typing import Any

        input_dict: dict[str, Any] = {}

        input_dict["messages"] = [
            await convert_message_to_openai_dict_new(m, download_images=self.download_images) for m in request.messages

@@ -139,30 +141,27 @@ class LiteLLMOpenAIMixin(
                    f"Unsupported response format: {type(fmt)}. Only JsonSchemaResponseFormat is supported."
                )

            fmt = fmt.json_schema
            name = fmt["title"]
            del fmt["title"]
            fmt["additionalProperties"] = False
            # Convert to dict for manipulation
            fmt_dict = dict(fmt.json_schema)
            name = fmt_dict["title"]
            del fmt_dict["title"]
            fmt_dict["additionalProperties"] = False

            # Apply additionalProperties: False recursively to all objects
            fmt = self._add_additional_properties_recursive(fmt)
            fmt_dict = self._add_additional_properties_recursive(fmt_dict)

            input_dict["response_format"] = {
                "type": "json_schema",
                "json_schema": {
                    "name": name,
                    "schema": fmt,
                    "schema": fmt_dict,
                    "strict": self.json_schema_strict,
                },
            }
        if request.tools:
            input_dict["tools"] = [convert_tooldef_to_openai_tool(tool) for tool in request.tools]
        if request.tool_config.tool_choice:
            input_dict["tool_choice"] = (
                request.tool_config.tool_choice.value
                if isinstance(request.tool_config.tool_choice, ToolChoice)
                else request.tool_config.tool_choice
            )
        if request.tool_config and (tool_choice := request.tool_config.tool_choice):
            input_dict["tool_choice"] = tool_choice.value if isinstance(tool_choice, ToolChoice) else tool_choice

        return {
            "model": request.model,

@@ -176,10 +175,10 @@ class LiteLLMOpenAIMixin(
    def get_api_key(self) -> str:
        provider_data = self.get_request_provider_data()
        key_field = self.provider_data_api_key_field
        if provider_data and getattr(provider_data, key_field, None):
            api_key = getattr(provider_data, key_field)
        else:
            api_key = self.api_key_from_config
        if provider_data and key_field and (api_key := getattr(provider_data, key_field, None)):
            return str(api_key)  # type: ignore[no-any-return] # getattr returns Any, can't narrow without runtime type inspection

        api_key = self.api_key_from_config
        if not api_key:
            raise ValueError(
                "API key is not set. Please provide a valid API key in the "

@@ -192,7 +191,13 @@ class LiteLLMOpenAIMixin(
        self,
        params: OpenAIEmbeddingsRequestWithExtraBody,
    ) -> OpenAIEmbeddingsResponse:
        if not self.model_store:
            raise ValueError("Model store is not initialized")

        model_obj = await self.model_store.get_model(params.model)
        if model_obj.provider_resource_id is None:
            raise ValueError(f"Model {params.model} has no provider_resource_id")
        provider_resource_id = model_obj.provider_resource_id

        # Convert input to list if it's a string
        input_list = [params.input] if isinstance(params.input, str) else params.input

@@ -200,7 +205,7 @@ class LiteLLMOpenAIMixin(
        # Call litellm embedding function
        # litellm.drop_params = True
        response = litellm.embedding(
            model=self.get_litellm_model_name(model_obj.provider_resource_id),
            model=self.get_litellm_model_name(provider_resource_id),
            input=input_list,
            api_key=self.get_api_key(),
            api_base=self.api_base,

@@ -217,7 +222,7 @@ class LiteLLMOpenAIMixin(

        return OpenAIEmbeddingsResponse(
            data=data,
            model=model_obj.provider_resource_id,
            model=provider_resource_id,
            usage=usage,
        )

@@ -225,10 +230,16 @@ class LiteLLMOpenAIMixin(
        self,
        params: OpenAICompletionRequestWithExtraBody,
    ) -> OpenAICompletion:
        if not self.model_store:
            raise ValueError("Model store is not initialized")

        model_obj = await self.model_store.get_model(params.model)
        if model_obj.provider_resource_id is None:
            raise ValueError(f"Model {params.model} has no provider_resource_id")
        provider_resource_id = model_obj.provider_resource_id

        request_params = await prepare_openai_completion_params(
            model=self.get_litellm_model_name(model_obj.provider_resource_id),
            model=self.get_litellm_model_name(provider_resource_id),
            prompt=params.prompt,
            best_of=params.best_of,
            echo=params.echo,

@@ -249,7 +260,8 @@ class LiteLLMOpenAIMixin(
            api_key=self.get_api_key(),
            api_base=self.api_base,
        )
        return await litellm.atext_completion(**request_params)
        # LiteLLM returns compatible type but mypy can't verify external library
        return await litellm.atext_completion(**request_params)  # type: ignore[no-any-return] # external lib lacks type stubs

    async def openai_chat_completion(
        self,

@@ -265,10 +277,16 @@ class LiteLLMOpenAIMixin(
        elif "include_usage" not in stream_options:
            stream_options = {**stream_options, "include_usage": True}

        if not self.model_store:
            raise ValueError("Model store is not initialized")

        model_obj = await self.model_store.get_model(params.model)
        if model_obj.provider_resource_id is None:
            raise ValueError(f"Model {params.model} has no provider_resource_id")
        provider_resource_id = model_obj.provider_resource_id

        request_params = await prepare_openai_completion_params(
            model=self.get_litellm_model_name(model_obj.provider_resource_id),
            model=self.get_litellm_model_name(provider_resource_id),
            messages=params.messages,
            frequency_penalty=params.frequency_penalty,
            function_call=params.function_call,

@@ -294,7 +312,8 @@ class LiteLLMOpenAIMixin(
            api_key=self.get_api_key(),
            api_base=self.api_base,
        )
        return await litellm.acompletion(**request_params)
        # LiteLLM returns compatible type but mypy can't verify external library
        return await litellm.acompletion(**request_params)  # type: ignore[no-any-return] # external lib lacks type stubs

    async def check_model_availability(self, model: str) -> bool:
        """
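The `get_api_key` rewrite above prefers a per-request key from provider data and falls back to the static config key, raising only when neither exists. A compact sketch of that resolution order (the field name and classes are illustrative):

from dataclasses import dataclass


@dataclass
class ProviderData:
    together_api_key: str | None = None


def resolve_api_key(
    provider_data: ProviderData | None,
    key_field: str,
    api_key_from_config: str | None,
) -> str:
    # Per-request provider data wins when it actually carries a value
    if provider_data and key_field and (api_key := getattr(provider_data, key_field, None)):
        return str(api_key)
    if not api_key_from_config:
        raise ValueError("API key is not set in config or request provider data")
    return api_key_from_config


assert resolve_api_key(ProviderData("req-key"), "together_api_key", "cfg-key") == "req-key"
assert resolve_api_key(None, "together_api_key", "cfg-key") == "cfg-key"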
@@ -161,8 +161,10 @@ def get_sampling_strategy_options(params: SamplingParams) -> dict:
    if isinstance(params.strategy, GreedySamplingStrategy):
        options["temperature"] = 0.0
    elif isinstance(params.strategy, TopPSamplingStrategy):
        options["temperature"] = params.strategy.temperature
        options["top_p"] = params.strategy.top_p
        if params.strategy.temperature is not None:
            options["temperature"] = params.strategy.temperature
        if params.strategy.top_p is not None:
            options["top_p"] = params.strategy.top_p
    elif isinstance(params.strategy, TopKSamplingStrategy):
        options["top_k"] = params.strategy.top_k
    else:

@@ -192,12 +194,12 @@ def get_sampling_options(params: SamplingParams | None) -> dict:

def text_from_choice(choice) -> str:
    if hasattr(choice, "delta") and choice.delta:
        return choice.delta.content
        return choice.delta.content  # type: ignore[no-any-return] # external OpenAI types lack precise annotations

    if hasattr(choice, "message"):
        return choice.message.content
        return choice.message.content  # type: ignore[no-any-return] # external OpenAI types lack precise annotations

    return choice.text
    return choice.text  # type: ignore[no-any-return] # external OpenAI types lack precise annotations


def get_stop_reason(finish_reason: str) -> StopReason:

@@ -216,7 +218,7 @@ def convert_openai_completion_logprobs(
) -> list[TokenLogProbs] | None:
    if not logprobs:
        return None
    if hasattr(logprobs, "top_logprobs"):
    if hasattr(logprobs, "top_logprobs") and logprobs.top_logprobs:
        return [TokenLogProbs(logprobs_by_token=x) for x in logprobs.top_logprobs]

    # Together supports logprobs with top_k=1 only. This means for each token position,

@@ -236,7 +238,7 @@ def convert_openai_completion_logprobs_stream(text: str, logprobs: float | OpenA
    if isinstance(logprobs, float):
        # Adapt response from Together CompletionChoicesChunk
        return [TokenLogProbs(logprobs_by_token={text: logprobs})]
    if hasattr(logprobs, "top_logprobs"):
    if hasattr(logprobs, "top_logprobs") and logprobs.top_logprobs:
        return [TokenLogProbs(logprobs_by_token=x) for x in logprobs.top_logprobs]
    return None

@@ -245,23 +247,24 @@ def process_completion_response(
    response: OpenAICompatCompletionResponse,
) -> CompletionResponse:
    choice = response.choices[0]
    text = choice.text or ""
    # drop suffix <eot_id> if present and return stop reason as end of turn
    if choice.text.endswith("<|eot_id|>"):
    if text.endswith("<|eot_id|>"):
        return CompletionResponse(
            stop_reason=StopReason.end_of_turn,
            content=choice.text[: -len("<|eot_id|>")],
            content=text[: -len("<|eot_id|>")],
            logprobs=convert_openai_completion_logprobs(choice.logprobs),
        )
    # drop suffix <eom_id> if present and return stop reason as end of message
    if choice.text.endswith("<|eom_id|>"):
    if text.endswith("<|eom_id|>"):
        return CompletionResponse(
            stop_reason=StopReason.end_of_message,
            content=choice.text[: -len("<|eom_id|>")],
            content=text[: -len("<|eom_id|>")],
            logprobs=convert_openai_completion_logprobs(choice.logprobs),
        )
    return CompletionResponse(
        stop_reason=get_stop_reason(choice.finish_reason),
        content=choice.text,
        stop_reason=get_stop_reason(choice.finish_reason or "stop"),
        content=text,
        logprobs=convert_openai_completion_logprobs(choice.logprobs),
    )
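The `process_completion_response` fix above normalizes `choice.text` to a string once, then strips Llama's end-of-turn and end-of-message suffixes. A minimal sketch of that suffix handling (the enum is a stand-in for the stack's StopReason, and the fallback reason is illustrative; the real code derives it from the provider's finish_reason):

from enum import Enum


class StopReason(Enum):
    end_of_turn = "end_of_turn"
    end_of_message = "end_of_message"
    out_of_tokens = "out_of_tokens"


def strip_terminator(raw_text: str | None) -> tuple[str, StopReason]:
    text = raw_text or ""  # providers may return None for empty completions
    for suffix, reason in (
        ("<|eot_id|>", StopReason.end_of_turn),
        ("<|eom_id|>", StopReason.end_of_message),
    ):
        if text.endswith(suffix):
            return text[: -len(suffix)], reason
    return text, StopReason.out_of_tokens


assert strip_terminator("hello<|eot_id|>") == ("hello", StopReason.end_of_turn)
assert strip_terminator(None) == ("", StopReason.out_of_tokens)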
@ -272,10 +275,10 @@ def process_chat_completion_response(
|
|||
) -> ChatCompletionResponse:
|
||||
choice = response.choices[0]
|
||||
if choice.finish_reason == "tool_calls":
|
||||
if not choice.message or not choice.message.tool_calls:
|
||||
if not hasattr(choice, "message") or not choice.message or not choice.message.tool_calls: # type: ignore[attr-defined] # OpenAICompatCompletionChoice is runtime duck-typed
|
||||
raise ValueError("Tool calls are not present in the response")
|
||||
|
||||
tool_calls = [convert_tool_call(tool_call) for tool_call in choice.message.tool_calls]
|
||||
tool_calls = [convert_tool_call(tool_call) for tool_call in choice.message.tool_calls] # type: ignore[attr-defined] # OpenAICompatCompletionChoice is runtime duck-typed
|
||||
if any(isinstance(tool_call, UnparseableToolCall) for tool_call in tool_calls):
|
||||
# If we couldn't parse a tool call, jsonify the tool calls and return them
|
||||
return ChatCompletionResponse(
|
||||
|
|
@ -287,9 +290,11 @@ def process_chat_completion_response(
|
|||
)
|
||||
else:
|
||||
# Otherwise, return tool calls as normal
|
||||
# Filter to only valid ToolCall objects
|
||||
valid_tool_calls = [tc for tc in tool_calls if isinstance(tc, ToolCall)]
|
||||
return ChatCompletionResponse(
|
||||
completion_message=CompletionMessage(
|
||||
tool_calls=tool_calls,
|
||||
tool_calls=valid_tool_calls,
|
||||
stop_reason=StopReason.end_of_turn,
|
||||
# Content is not optional
|
||||
content="",
|
||||
|
|
@ -299,7 +304,7 @@ def process_chat_completion_response(
|
|||
|
||||
# TODO: This does not work well with tool calls for vLLM remote provider
|
||||
# Ref: https://github.com/meta-llama/llama-stack/issues/1058
|
||||
raw_message = decode_assistant_message(text_from_choice(choice), get_stop_reason(choice.finish_reason))
|
||||
raw_message = decode_assistant_message(text_from_choice(choice), get_stop_reason(choice.finish_reason or "stop"))
|
||||
|
||||
# NOTE: If we do not set tools in chat-completion request, we should not
|
||||
# expect the ToolCall in the response. Instead, we should return the raw
|
||||
|
|
@ -324,8 +329,8 @@ def process_chat_completion_response(
|
|||
|
||||
return ChatCompletionResponse(
|
||||
completion_message=CompletionMessage(
|
||||
content=raw_message.content,
|
||||
stop_reason=raw_message.stop_reason,
|
||||
content=raw_message.content, # type: ignore[arg-type] # decode_assistant_message returns Union[str, InterleavedContent]
|
||||
stop_reason=raw_message.stop_reason or StopReason.end_of_turn,
|
||||
tool_calls=raw_message.tool_calls,
|
||||
),
|
||||
logprobs=None,
|
||||
|
|
@ -448,7 +453,7 @@ async def process_chat_completion_stream_response(
|
|||
)
|
||||
|
||||
# parse tool calls and report errors
|
||||
message = decode_assistant_message(buffer, stop_reason)
|
||||
message = decode_assistant_message(buffer, stop_reason or StopReason.end_of_turn)
|
||||
|
||||
parsed_tool_calls = len(message.tool_calls) > 0
|
||||
if ipython and not parsed_tool_calls:
|
||||
|
|
@ -463,7 +468,7 @@ async def process_chat_completion_stream_response(
|
|||
)
|
||||
)
|
||||
|
||||
request_tools = {t.tool_name: t for t in request.tools}
|
||||
request_tools = {t.tool_name: t for t in (request.tools or [])}
|
||||
for tool_call in message.tool_calls:
|
||||
if tool_call.tool_name in request_tools:
|
||||
yield ChatCompletionResponseStreamChunk(
|
||||
|
|
@ -525,7 +530,7 @@ async def convert_message_to_openai_dict(message: Message, download: bool = Fals
|
|||
}
|
||||
|
||||
if hasattr(message, "tool_calls") and message.tool_calls:
|
||||
result["tool_calls"] = []
|
||||
tool_calls_list = []
|
||||
for tc in message.tool_calls:
|
||||
# The tool.tool_name can be a str or a BuiltinTool enum. If
|
||||
# it's the latter, convert to a string.
|
||||
|
|
@ -533,7 +538,7 @@ async def convert_message_to_openai_dict(message: Message, download: bool = Fals
|
|||
if isinstance(tool_name, BuiltinTool):
|
||||
tool_name = tool_name.value
|
||||
|
||||
result["tool_calls"].append(
|
||||
tool_calls_list.append(
|
||||
{
|
||||
"id": tc.call_id,
|
||||
"type": "function",
|
||||
|
|
@ -543,6 +548,7 @@ async def convert_message_to_openai_dict(message: Message, download: bool = Fals
|
|||
},
|
||||
}
|
||||
)
|
||||
result["tool_calls"] = tool_calls_list # type: ignore[assignment] # dict allows Any value, stricter type expected
|
||||
return result
|
||||
|
||||
|
||||
|
|
@ -608,7 +614,7 @@ async def convert_message_to_openai_dict_new(
|
|||
),
|
||||
)
|
||||
elif isinstance(content_, list):
|
||||
return [await impl(item) for item in content_]
|
||||
return [await impl(item) for item in content_] # type: ignore[misc] # recursive list comprehension confuses mypy's type narrowing
|
||||
else:
|
||||
raise ValueError(f"Unsupported content type: {type(content_)}")
|
||||
|
||||
|
|
@ -620,7 +626,7 @@ async def convert_message_to_openai_dict_new(
|
|||
else:
|
||||
return [ret]
|
||||
|
||||
out: OpenAIChatCompletionMessage = None
|
||||
out: OpenAIChatCompletionMessage
|
||||
if isinstance(message, UserMessage):
|
||||
out = OpenAIChatCompletionUserMessage(
|
||||
role="user",
|
||||
|
|
@ -636,7 +642,7 @@ async def convert_message_to_openai_dict_new(
|
|||
),
|
||||
type="function",
|
||||
)
|
||||
for tool in message.tool_calls
|
||||
for tool in (message.tool_calls or [])
|
||||
]
|
||||
params = {}
|
||||
if tool_calls:
|
||||
|
|
@ -644,18 +650,18 @@ async def convert_message_to_openai_dict_new(
|
|||
out = OpenAIChatCompletionAssistantMessage(
|
||||
role="assistant",
|
||||
content=await _convert_message_content(message.content),
|
||||
**params,
|
||||
**params, # type: ignore[typeddict-item] # tool_calls dict expansion conflicts with TypedDict optional field
|
||||
)
|
||||
elif isinstance(message, ToolResponseMessage):
|
||||
out = OpenAIChatCompletionToolMessage(
|
||||
role="tool",
|
||||
tool_call_id=message.call_id,
|
||||
content=await _convert_message_content(message.content),
|
||||
content=await _convert_message_content(message.content), # type: ignore[typeddict-item] # content union type incompatible with TypedDict str requirement
|
||||
)
|
||||
elif isinstance(message, SystemMessage):
|
||||
out = OpenAIChatCompletionSystemMessage(
|
||||
role="system",
|
||||
content=await _convert_message_content(message.content),
|
||||
content=await _convert_message_content(message.content), # type: ignore[typeddict-item] # content union type incompatible with TypedDict str requirement
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported message type: {type(message)}")
|
||||
|
|
@@ -758,16 +764,16 @@ def convert_tooldef_to_openai_tool(tool: ToolDefinition) -> dict:
    function = out["function"]

    if isinstance(tool.tool_name, BuiltinTool):
-        function["name"] = tool.tool_name.value
+        function["name"] = tool.tool_name.value # type: ignore[index] # dict value inferred as Any but mypy sees Collection[str]
    else:
-        function["name"] = tool.tool_name
+        function["name"] = tool.tool_name # type: ignore[index] # dict value inferred as Any but mypy sees Collection[str]

    if tool.description:
-        function["description"] = tool.description
+        function["description"] = tool.description # type: ignore[index] # dict value inferred as Any but mypy sees Collection[str]

    if tool.input_schema:
        # Pass through the entire JSON Schema as-is
-        function["parameters"] = tool.input_schema
+        function["parameters"] = tool.input_schema # type: ignore[index] # dict value inferred as Any but mypy sees Collection[str]

    # NOTE: OpenAI does not support output_schema, so we drop it here
    # It's stored in LlamaStack for validation and other provider usage
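
For orientation, the dict these assignments build up is the standard OpenAI function-tool payload. A minimal sketch of the end result, using the knowledge_search tool that appears in the recorded test requests further down:

# Shape targeted by convert_tooldef_to_openai_tool; values mirror the
# knowledge_search tool from the recorded test requests below.
openai_tool = {
    "type": "function",
    "function": {
        "name": "knowledge_search",
        "description": "Search for information in a database.",
        "parameters": {  # the tool's input_schema, passed through as-is
            "type": "object",
            "properties": {"query": {"type": "string"}},
            "required": ["query"],
        },
    },
}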
@@ -815,15 +821,15 @@ def _convert_openai_request_tool_config(tool_choice: str | dict[str, Any] | None
    tool_config = ToolConfig()
    if tool_choice:
        try:
-            tool_choice = ToolChoice(tool_choice)
+            tool_choice = ToolChoice(tool_choice) # type: ignore[assignment] # reassigning to enum narrows union but mypy can't track after exception
        except ValueError:
            pass
-        tool_config.tool_choice = tool_choice
+        tool_config.tool_choice = tool_choice # type: ignore[assignment] # ToolConfig.tool_choice accepts Union[ToolChoice, dict] but mypy tracks narrower type
    return tool_config


def _convert_openai_request_tools(tools: list[dict[str, Any]] | None = None) -> list[ToolDefinition]:
-    lls_tools = []
+    lls_tools: list[ToolDefinition] = []
    if not tools:
        return lls_tools
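
A standalone sketch of the coercion pattern in this hunk (the enum below is a stand-in for llama_stack's ToolChoice): strings that match an enum value are narrowed, and anything else, such as a named-tool dict, falls through untouched.

from enum import Enum

class ToolChoiceSketch(Enum):  # stand-in for llama_stack's ToolChoice
    auto = "auto"
    required = "required"
    none = "none"

def coerce_tool_choice(tool_choice):
    try:
        return ToolChoiceSketch(tool_choice)  # known strings become enum members
    except ValueError:
        return tool_choice  # e.g. a {"type": "function", ...} dict passes through

assert coerce_tool_choice("auto") is ToolChoiceSketch.auto
assert coerce_tool_choice({"type": "function"}) == {"type": "function"}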
@@ -843,16 +849,16 @@ def _convert_openai_request_tools(tools: list[dict[str, Any]] | None = None) ->


def _convert_openai_request_response_format(
-    response_format: OpenAIResponseFormatParam = None,
+    response_format: OpenAIResponseFormatParam | None = None,
):
    if not response_format:
        return None
    # response_format can be a dict or a pydantic model
-    response_format = dict(response_format)
-    if response_format.get("type", "") == "json_schema":
+    response_format_dict = dict(response_format) # type: ignore[arg-type] # OpenAIResponseFormatParam union needs dict conversion
+    if response_format_dict.get("type", "") == "json_schema":
        return JsonSchemaResponseFormat(
-            type="json_schema",
-            json_schema=response_format.get("json_schema", {}).get("schema", ""),
+            type="json_schema", # type: ignore[arg-type] # Literal["json_schema"] incompatible with expected type
+            json_schema=response_format_dict.get("json_schema", {}).get("schema", ""),
        )
    return None
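
A standalone sketch of the normalization this hunk performs, with the JsonSchemaResponseFormat construction replaced by returning the extracted schema dict; the example payload mirrors OpenAI's json_schema response_format shape.

def extract_json_schema(response_format: dict | None):
    # Normalize to a plain dict first (the real code also accepts a pydantic
    # model), then pull the nested schema out of the json_schema wrapper.
    if not response_format:
        return None
    response_format_dict = dict(response_format)
    if response_format_dict.get("type", "") == "json_schema":
        return response_format_dict.get("json_schema", {}).get("schema", "")
    return None

fmt = {"type": "json_schema", "json_schema": {"name": "answer", "schema": {"type": "object"}}}
assert extract_json_schema(fmt) == {"type": "object"}
assert extract_json_schema({"type": "text"}) is None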
@@ -938,16 +944,15 @@ def _convert_openai_sampling_params(

    # Map an explicit temperature of 0 to greedy sampling
    if temperature == 0:
-        strategy = GreedySamplingStrategy()
+        sampling_params.strategy = GreedySamplingStrategy()
    else:
        # OpenAI defaults to 1.0 for temperature and top_p if unset
        if temperature is None:
            temperature = 1.0
        if top_p is None:
            top_p = 1.0
-        strategy = TopPSamplingStrategy(temperature=temperature, top_p=top_p)
+        sampling_params.strategy = TopPSamplingStrategy(temperature=temperature, top_p=top_p) # type: ignore[assignment] # SamplingParams.strategy union accepts this type

-    sampling_params.strategy = strategy
    return sampling_params
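
The mapping above in one runnable sketch, with dataclasses standing in for GreedySamplingStrategy and TopPSamplingStrategy:

from dataclasses import dataclass

@dataclass
class Greedy:  # stand-in for GreedySamplingStrategy
    pass

@dataclass
class TopP:  # stand-in for TopPSamplingStrategy
    temperature: float
    top_p: float

def pick_strategy(temperature: float | None, top_p: float | None):
    if temperature == 0:  # an explicit 0 means greedy decoding
        return Greedy()
    # OpenAI defaults both knobs to 1.0 when unset
    return TopP(temperature if temperature is not None else 1.0,
                top_p if top_p is not None else 1.0)

assert pick_strategy(0, None) == Greedy()
assert pick_strategy(None, 0.9) == TopP(temperature=1.0, top_p=0.9)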
@@ -957,23 +962,24 @@ def openai_messages_to_messages(
    """
    Convert a list of OpenAIChatCompletionMessage into a list of Message.
    """
-    converted_messages = []
+    converted_messages: list[Message] = []
    for message in messages:
+        converted_message: Message
        if message.role == "system":
-            converted_message = SystemMessage(content=openai_content_to_content(message.content))
+            converted_message = SystemMessage(content=openai_content_to_content(message.content)) # type: ignore[arg-type] # OpenAI SDK uses aliased types internally that mypy sees as incompatible with base types
        elif message.role == "user":
-            converted_message = UserMessage(content=openai_content_to_content(message.content))
+            converted_message = UserMessage(content=openai_content_to_content(message.content)) # type: ignore[arg-type] # OpenAI SDK uses aliased types internally that mypy sees as incompatible with base types
        elif message.role == "assistant":
            converted_message = CompletionMessage(
-                content=openai_content_to_content(message.content),
-                tool_calls=_convert_openai_tool_calls(message.tool_calls),
+                content=openai_content_to_content(message.content), # type: ignore[arg-type] # OpenAI SDK uses aliased types internally that mypy sees as incompatible with base types
+                tool_calls=_convert_openai_tool_calls(message.tool_calls) if message.tool_calls else [], # type: ignore[arg-type] # OpenAI tool_calls type incompatible with conversion function
                stop_reason=StopReason.end_of_turn,
            )
        elif message.role == "tool":
            converted_message = ToolResponseMessage(
                role="tool",
                call_id=message.tool_call_id,
-                content=openai_content_to_content(message.content),
+                content=openai_content_to_content(message.content), # type: ignore[arg-type] # OpenAI SDK uses aliased types internally that mypy sees as incompatible with base types
            )
        else:
            raise ValueError(f"Unknown role {message.role}")
@@ -990,9 +996,9 @@ def openai_content_to_content(content: str | Iterable[OpenAIChatCompletionConten
        return [openai_content_to_content(c) for c in content]
    elif hasattr(content, "type"):
        if content.type == "text":
-            return TextContentItem(type="text", text=content.text)
+            return TextContentItem(type="text", text=content.text) # type: ignore[attr-defined] # Iterable narrowed by hasattr check but mypy doesn't track
        elif content.type == "image_url":
-            return ImageContentItem(type="image", image=_URLOrData(url=URL(uri=content.image_url.url)))
+            return ImageContentItem(type="image", image=_URLOrData(url=URL(uri=content.image_url.url))) # type: ignore[attr-defined] # Iterable narrowed by hasattr check but mypy doesn't track
        else:
            raise ValueError(f"Unknown content type: {content.type}")
    else:
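
A standalone sketch of the dispatch in openai_content_to_content, with plain dicts standing in for TextContentItem and ImageContentItem; the final rejection branch is an assumption added here for completeness.

def to_content(content):
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        return [to_content(c) for c in content]  # recurse over multi-part content
    if hasattr(content, "type"):  # duck-typed OpenAI content part
        if content.type == "text":
            return {"type": "text", "text": content.text}
        if content.type == "image_url":
            return {"type": "image", "url": content.image_url.url}
        raise ValueError(f"Unknown content type: {content.type}")
    raise TypeError(f"Unsupported content: {type(content)}")  # assumption: reject anything else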
@@ -1041,9 +1047,9 @@ def convert_openai_chat_completion_choice(
        completion_message=CompletionMessage(
            content=choice.message.content or "",  # CompletionMessage content is not optional
            stop_reason=_convert_openai_finish_reason(choice.finish_reason),
-            tool_calls=_convert_openai_tool_calls(choice.message.tool_calls),
+            tool_calls=_convert_openai_tool_calls(choice.message.tool_calls) if choice.message.tool_calls else [], # type: ignore[arg-type] # OpenAI tool_calls Optional type broadens union
        ),
-        logprobs=_convert_openai_logprobs(getattr(choice, "logprobs", None)),
+        logprobs=_convert_openai_logprobs(getattr(choice, "logprobs", None)), # type: ignore[arg-type] # getattr returns Any, can't narrow without inspection
    )
@@ -1070,7 +1076,7 @@ async def convert_openai_chat_completion_stream(
        choice = chunk.choices[0]  # assuming only one choice per chunk

        # we assume there's only one finish_reason in the stream
-        stop_reason = _convert_openai_finish_reason(choice.finish_reason) or stop_reason
+        stop_reason = _convert_openai_finish_reason(choice.finish_reason) if choice.finish_reason else stop_reason
        logprobs = getattr(choice, "logprobs", None)

        # if there's a tool call, emit an event for each tool in the list
@@ -1083,7 +1089,7 @@ async def convert_openai_chat_completion_stream(
                event=ChatCompletionResponseEvent(
                    event_type=event_type,
                    delta=TextDelta(text=choice.delta.content),
-                    logprobs=_convert_openai_logprobs(logprobs),
+                    logprobs=_convert_openai_logprobs(logprobs), # type: ignore[arg-type] # logprobs type broadened from getattr result
                )
            )
@@ -1101,10 +1107,10 @@ async def convert_openai_chat_completion_stream(
                    event=ChatCompletionResponseEvent(
                        event_type=event_type,
                        delta=ToolCallDelta(
-                            tool_call=_convert_openai_tool_calls([tool_call])[0],
+                            tool_call=_convert_openai_tool_calls([tool_call])[0], # type: ignore[arg-type, list-item] # delta tool_call type differs from complete tool_call
                            parse_status=ToolCallParseStatus.succeeded,
                        ),
-                        logprobs=_convert_openai_logprobs(logprobs),
+                        logprobs=_convert_openai_logprobs(logprobs), # type: ignore[arg-type] # logprobs type broadened from getattr result
                    )
                )
            else:
@@ -1125,12 +1131,15 @@ async def convert_openai_chat_completion_stream(
                if tool_call.function.name:
                    buffer["name"] = tool_call.function.name
                    delta = f"{buffer['name']}("
-                    buffer["content"] += delta
+                    if buffer["content"] is not None:
+                        buffer["content"] += delta

                if tool_call.function.arguments:
                    delta = tool_call.function.arguments
-                    buffer["arguments"] += delta
-                    buffer["content"] += delta
+                    if buffer["arguments"] is not None and delta:
+                        buffer["arguments"] += delta
+                    if buffer["content"] is not None and delta:
+                        buffer["content"] += delta

                yield ChatCompletionResponseStreamChunk(
                    event=ChatCompletionResponseEvent(
@@ -1139,7 +1148,7 @@ async def convert_openai_chat_completion_stream(
                        tool_call=delta,
                        parse_status=ToolCallParseStatus.in_progress,
                    ),
-                    logprobs=_convert_openai_logprobs(logprobs),
+                    logprobs=_convert_openai_logprobs(logprobs), # type: ignore[arg-type] # logprobs type broadened from getattr result
                )
            )
        elif choice.delta.content:
@@ -1147,7 +1156,7 @@ async def convert_openai_chat_completion_stream(
                event=ChatCompletionResponseEvent(
                    event_type=event_type,
                    delta=TextDelta(text=choice.delta.content or ""),
-                    logprobs=_convert_openai_logprobs(logprobs),
+                    logprobs=_convert_openai_logprobs(logprobs), # type: ignore[arg-type] # logprobs type broadened from getattr result
                )
            )
@@ -1155,7 +1164,8 @@ async def convert_openai_chat_completion_stream(
        logger.debug(f"toolcall_buffer[{idx}]: {buffer}")
        if buffer["name"]:
            delta = ")"
-            buffer["content"] += delta
+            if buffer["content"] is not None:
+                buffer["content"] += delta
            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=event_type,
@@ -1168,16 +1178,16 @@ async def convert_openai_chat_completion_stream(
            )

        try:
-            tool_call = ToolCall(
-                call_id=buffer["call_id"],
-                tool_name=buffer["name"],
-                arguments=buffer["arguments"],
+            parsed_tool_call = ToolCall(
+                call_id=buffer["call_id"] or "",
+                tool_name=buffer["name"] or "",
+                arguments=buffer["arguments"] or "",
            )
            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=ChatCompletionResponseEventType.progress,
                    delta=ToolCallDelta(
-                        tool_call=tool_call,
+                        tool_call=parsed_tool_call, # type: ignore[arg-type] # ToolCallDelta.tool_call accepts Union[str, ToolCall]
                        parse_status=ToolCallParseStatus.succeeded,
                    ),
                    stop_reason=stop_reason,
@@ -1189,7 +1199,7 @@ async def convert_openai_chat_completion_stream(
                event=ChatCompletionResponseEvent(
                    event_type=ChatCompletionResponseEventType.progress,
                    delta=ToolCallDelta(
-                        tool_call=buffer["content"],
+                        tool_call=buffer["content"], # type: ignore[arg-type] # ToolCallDelta.tool_call accepts Union[str, ToolCall]
                        parse_status=ToolCallParseStatus.failed,
                    ),
                    stop_reason=stop_reason,
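
The tool-call buffering these hunks guard can be sketched standalone; get_weather and its arguments are made up for illustration. Name fragments open the printable form, argument fragments append to both fields, and the closing paren lands once the stream reports a finish reason:

buffer = {"call_id": "call_1", "name": None, "arguments": "", "content": ""}

def on_tool_delta(name, arguments):
    if name:
        buffer["name"] = name
        if buffer["content"] is not None:
            buffer["content"] += f"{name}("
    if arguments:
        if buffer["arguments"] is not None:
            buffer["arguments"] += arguments
        if buffer["content"] is not None:
            buffer["content"] += arguments

on_tool_delta("get_weather", None)   # hypothetical tool name
on_tool_delta(None, '{"city": ')
on_tool_delta(None, '"Paris"}')
if buffer["name"]:                   # stream ended
    buffer["content"] += ")"
assert buffer["content"] == 'get_weather({"city": "Paris"})'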
@@ -1250,7 +1260,7 @@ class OpenAIChatCompletionToLlamaStackMixin:
        top_p: float | None = None,
        user: str | None = None,
    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        messages = openai_messages_to_messages(messages)
+        messages = openai_messages_to_messages(messages) # type: ignore[assignment] # converted from OpenAI to LlamaStack message format
        response_format = _convert_openai_request_response_format(response_format)
        sampling_params = _convert_openai_sampling_params(
            max_tokens=max_tokens,
@@ -1259,15 +1269,15 @@ class OpenAIChatCompletionToLlamaStackMixin:
        )
        tool_config = _convert_openai_request_tool_config(tool_choice)

-        tools = _convert_openai_request_tools(tools)
+        tools = _convert_openai_request_tools(tools) # type: ignore[assignment] # converted from OpenAI to LlamaStack tool format
        if tool_config.tool_choice == ToolChoice.none:
-            tools = []
+            tools = [] # type: ignore[assignment] # empty list narrows return type but mypy tracks broader type

        outstanding_responses = []
        # "n" is the number of completions to generate per prompt
        n = n or 1
        for _i in range(0, n):
-            response = self.chat_completion(
+            response = self.chat_completion( # type: ignore[attr-defined] # mixin expects class to implement chat_completion
                model_id=model,
                messages=messages,
                sampling_params=sampling_params,
@@ -1279,7 +1289,7 @@ class OpenAIChatCompletionToLlamaStackMixin:
            outstanding_responses.append(response)

        if stream:
-            return OpenAIChatCompletionToLlamaStackMixin._process_stream_response(self, model, outstanding_responses)
+            return OpenAIChatCompletionToLlamaStackMixin._process_stream_response(self, model, outstanding_responses) # type: ignore[no-any-return] # mixin async generator return type too complex for mypy

        return await OpenAIChatCompletionToLlamaStackMixin._process_non_stream_response(
            self, model, outstanding_responses
@@ -1295,14 +1305,16 @@ class OpenAIChatCompletionToLlamaStackMixin:
            response = await outstanding_response
            async for chunk in response:
                event = chunk.event
-                finish_reason = _convert_stop_reason_to_openai_finish_reason(event.stop_reason)
+                finish_reason = (
+                    _convert_stop_reason_to_openai_finish_reason(event.stop_reason) if event.stop_reason else None
+                )

                if isinstance(event.delta, TextDelta):
                    text_delta = event.delta.text
                    delta = OpenAIChoiceDelta(content=text_delta)
                    yield OpenAIChatCompletionChunk(
                        id=id,
-                        choices=[OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta)],
+                        choices=[OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta)], # type: ignore[arg-type] # finish_reason Optional[str] incompatible with Literal union
                        created=int(time.time()),
                        model=model,
                        object="chat.completion.chunk",
@@ -1310,13 +1322,17 @@ class OpenAIChatCompletionToLlamaStackMixin:
                elif isinstance(event.delta, ToolCallDelta):
                    if event.delta.parse_status == ToolCallParseStatus.succeeded:
                        tool_call = event.delta.tool_call
+                        if isinstance(tool_call, str):
+                            continue

                        # First chunk includes full structure
                        openai_tool_call = OpenAIChoiceDeltaToolCall(
                            index=0,
                            id=tool_call.call_id,
                            function=OpenAIChoiceDeltaToolCallFunction(
-                                name=tool_call.tool_name,
+                                name=tool_call.tool_name
+                                if isinstance(tool_call.tool_name, str)
+                                else tool_call.tool_name.value, # type: ignore[arg-type] # enum .value extraction on Union confuses mypy
                                arguments="",
                            ),
                        )
@@ -1324,7 +1340,7 @@ class OpenAIChatCompletionToLlamaStackMixin:
                        yield OpenAIChatCompletionChunk(
                            id=id,
                            choices=[
-                                OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta)
+                                OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta) # type: ignore[arg-type] # finish_reason Optional[str] incompatible with Literal union
                            ],
                            created=int(time.time()),
                            model=model,
@@ -1341,7 +1357,7 @@ class OpenAIChatCompletionToLlamaStackMixin:
                yield OpenAIChatCompletionChunk(
                    id=id,
                    choices=[
-                        OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta)
+                        OpenAIChatCompletionChunkChoice(index=i, finish_reason=finish_reason, delta=delta) # type: ignore[arg-type] # finish_reason Optional[str] incompatible with Literal union
                    ],
                    created=int(time.time()),
                    model=model,
@@ -1351,7 +1367,7 @@ class OpenAIChatCompletionToLlamaStackMixin:
    async def _process_non_stream_response(
        self, model: str, outstanding_responses: list[Awaitable[ChatCompletionResponse]]
    ) -> OpenAIChatCompletion:
-        choices = []
+        choices: list[OpenAIChatCompletionChoice] = []
        for outstanding_response in outstanding_responses:
            response = await outstanding_response
            completion_message = response.completion_message
@@ -1360,14 +1376,14 @@ class OpenAIChatCompletionToLlamaStackMixin:

        choice = OpenAIChatCompletionChoice(
            index=len(choices),
-            message=message,
+            message=message, # type: ignore[arg-type] # OpenAIChatCompletionMessage union incompatible with narrower Message type
            finish_reason=finish_reason,
        )
-        choices.append(choice)
+        choices.append(choice) # type: ignore[arg-type] # OpenAIChatCompletionChoice type annotation mismatch

        return OpenAIChatCompletion(
            id=f"chatcmpl-{uuid.uuid4()}",
-            choices=choices,
+            choices=choices, # type: ignore[arg-type] # list[OpenAIChatCompletionChoice] union incompatible
            created=int(time.time()),
            model=model,
            object="chat.completion",
@@ -196,6 +196,7 @@ def make_overlapped_chunks(
        chunks.append(
            Chunk(
                content=chunk,
+                chunk_id=chunk_id,
                metadata=chunk_metadata,
                chunk_metadata=backend_chunk_metadata,
            )
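
The surrounding function name suggests sliding-window chunking over tokens; a minimal sketch under that assumption (the window and overlap parameters are illustrative, not taken from the implementation):

def overlapped_windows(tokens: list[str], window: int, overlap: int) -> list[list[str]]:
    # Each chunk starts where the previous one ends, minus the overlap.
    step = window - overlap
    return [tokens[i : i + window] for i in range(0, len(tokens), step)]

assert overlapped_windows(list("abcdef"), window=4, overlap=2) == [
    ["a", "b", "c", "d"],
    ["c", "d", "e", "f"],
    ["e", "f"],
]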
@@ -430,6 +430,32 @@ def _unwrap_generic_list(typ: type[list[T]]) -> type[T]:
    return list_type  # type: ignore[no-any-return]


+def is_generic_sequence(typ: object) -> bool:
+    "True if the specified type is a generic Sequence, i.e. `Sequence[T]`."
+    import collections.abc
+
+    typ = unwrap_annotated_type(typ)
+    return typing.get_origin(typ) is collections.abc.Sequence
+
+
+def unwrap_generic_sequence(typ: object) -> type:
+    """
+    Extracts the item type of a Sequence type.
+
+    :param typ: The Sequence type `Sequence[T]`.
+    :returns: The item type `T`.
+    """
+
+    return rewrap_annotated_type(_unwrap_generic_sequence, typ)  # type: ignore[arg-type]
+
+
+def _unwrap_generic_sequence(typ: object) -> type:
+    "Extracts the item type of a Sequence type (e.g. returns `T` for `Sequence[T]`)."
+
+    (sequence_type,) = typing.get_args(typ)  # unpack single tuple element
+    return sequence_type  # type: ignore[no-any-return]
+
+
def is_generic_set(typ: object) -> TypeGuard[type[set]]:
    "True if the specified type is a generic set, i.e. `Set[T]`."
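
The new helpers hinge on typing.get_origin; a self-contained check of the same idea, with unwrap_annotated_type approximated inline:

import collections.abc
import typing
from typing import Annotated, Sequence

def is_generic_sequence(typ: object) -> bool:
    if typing.get_origin(typ) is Annotated:  # approximate unwrap_annotated_type
        typ = typing.get_args(typ)[0]
    return typing.get_origin(typ) is collections.abc.Sequence

assert is_generic_sequence(Sequence[int])
assert is_generic_sequence(Annotated[Sequence[int], "doc"])
assert not is_generic_sequence(list[int])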
@@ -18,10 +18,12 @@ from .inspection import (
    TypeLike,
    is_generic_dict,
    is_generic_list,
+    is_generic_sequence,
    is_type_optional,
    is_type_union,
    unwrap_generic_dict,
    unwrap_generic_list,
+    unwrap_generic_sequence,
    unwrap_optional_type,
    unwrap_union_types,
)
@@ -155,24 +157,28 @@ def python_type_to_name(data_type: TypeLike, force: bool = False) -> str:
    if metadata is not None:
        # type is Annotated[T, ...]
        arg = typing.get_args(data_type)[0]
-        return python_type_to_name(arg)
+        return python_type_to_name(arg, force=force)

    if force:
        # generic types
        if is_type_optional(data_type, strict=True):
-            inner_name = python_type_to_name(unwrap_optional_type(data_type))
+            inner_name = python_type_to_name(unwrap_optional_type(data_type), force=True)
            return f"Optional__{inner_name}"
        elif is_generic_list(data_type):
-            item_name = python_type_to_name(unwrap_generic_list(data_type))
+            item_name = python_type_to_name(unwrap_generic_list(data_type), force=True)
            return f"List__{item_name}"
+        elif is_generic_sequence(data_type):
+            # Treat Sequence the same as List for schema generation purposes
+            item_name = python_type_to_name(unwrap_generic_sequence(data_type), force=True)
+            return f"List__{item_name}"
        elif is_generic_dict(data_type):
            key_type, value_type = unwrap_generic_dict(data_type)
-            key_name = python_type_to_name(key_type)
-            value_name = python_type_to_name(value_type)
+            key_name = python_type_to_name(key_type, force=True)
+            value_name = python_type_to_name(value_type, force=True)
            return f"Dict__{key_name}__{value_name}"
        elif is_type_union(data_type):
            member_types = unwrap_union_types(data_type)
-            member_names = "__".join(python_type_to_name(member_type) for member_type in member_types)
+            member_names = "__".join(python_type_to_name(member_type, force=True) for member_type in member_types)
            return f"Union__{member_names}"

    # named system or user-defined type
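
What the force=True propagation buys, sketched standalone with list and dict only (llama_stack's version also covers Optional, Sequence, and unions):

import typing

def type_name(t, force: bool = False) -> str:
    origin = typing.get_origin(t)
    if force and origin is list:
        (item,) = typing.get_args(t)
        return f"List__{type_name(item, force=True)}"  # force now recurses
    if force and origin is dict:
        key, value = typing.get_args(t)
        return f"Dict__{type_name(key, force=True)}__{type_name(value, force=True)}"
    return t.__name__

assert type_name(list[int], force=True) == "List__int"
# Before the fix the nested calls dropped force, so inner generics fell
# through to the plain-name path instead of getting synthetic names.
assert type_name(dict[str, list[int]], force=True) == "Dict__str__List__int"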
@@ -111,7 +111,7 @@ def get_class_property_docstrings(
def docstring_to_schema(data_type: type) -> Schema:
    short_description, long_description = get_class_docstrings(data_type)
    schema: Schema = {
-        "title": python_type_to_name(data_type),
+        "title": python_type_to_name(data_type, force=True),
    }

    description = "\n".join(filter(None, [short_description, long_description]))
@@ -417,6 +417,10 @@ class JsonSchemaGenerator:
        if origin_type is list:
            (list_type,) = typing.get_args(typ)  # unpack single tuple element
            return {"type": "array", "items": self.type_to_schema(list_type)}
+        elif origin_type is collections.abc.Sequence:
+            # Treat Sequence the same as list for JSON schema (both are arrays)
+            (sequence_type,) = typing.get_args(typ)  # unpack single tuple element
+            return {"type": "array", "items": self.type_to_schema(sequence_type)}
        elif origin_type is dict:
            key_type, value_type = typing.get_args(typ)
            if not (key_type is str or key_type is int or is_type_enum(key_type)):
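
The schema branch added above, restated as a runnable check: Sequence[T] and list[T] now produce the same array schema (the item schema here is hard-coded rather than recursing through type_to_schema):

import collections.abc
import typing
from typing import Sequence

def array_schema(typ) -> dict:
    origin = typing.get_origin(typ)
    if origin in (list, collections.abc.Sequence):
        (item_type,) = typing.get_args(typ)
        return {"type": "array", "items": {"type": "string"} if item_type is str else {}}
    raise TypeError(f"not an array-like type: {typ!r}")

assert array_schema(Sequence[str]) == {"type": "array", "items": {"type": "string"}}
assert array_schema(list[str]) == array_schema(Sequence[str])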
@@ -39,7 +39,7 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
    if provider.provider_type in (
        "inline::meta-reference",
        "inline::sentence-transformers",
        "inline::vllm",
        "remote::vllm",
        "remote::bedrock",
        "remote::databricks",
        # Technically Nvidia does support OpenAI completions, but none of their hosted models
@@ -120,7 +120,7 @@ def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, mode
    if provider.provider_type in (
        "inline::meta-reference",
        "inline::sentence-transformers",
        "inline::vllm",
        "remote::vllm",
        "remote::bedrock",
        "remote::databricks",
        "remote::cerebras",
1766 tests/integration/responses/recordings/00cc2202e2906845aec8fe97f0e31e55abd32a289a516722ccab502c4e312c2c.json generated Normal file
File diff suppressed because it is too large
Load diff
@@ -0,0 +1,763 @@
{"test_id": "tests/integration/responses/test_tool_responses.py::test_response_non_streaming_file_search[openai_client-txt=openai/gpt-4o:emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5:dim=768-llama_experts]", "endpoint": "/v1/chat/completions", "model": "gpt-4o", "is_streaming": true}
2098 tests/integration/responses/recordings/2f5d0087ba947141d94b9ba6462c03ff01d8f4948fedd8fd84cabfa80f5f0373.json generated Normal file
File diff suppressed because it is too large
Load diff
6318 tests/integration/responses/recordings/300c5041332a0ad2990a05df88a6b6842e02157d807564c136dc71cffe2b78cc.json generated Normal file
File diff suppressed because it is too large
Load diff
@@ -0,0 +1,767 @@
{"test_id": "tests/integration/responses/test_tool_responses.py::test_response_non_streaming_file_search[openai_client-txt=openai/gpt-4o:emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5:dim=768-llama_experts_pdf]", "endpoint": "/v1/chat/completions", "model": "gpt-4o", "is_streaming": true}
|
||||
{
|
||||
"delta": {
|
||||
"content": "-",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "CUpjI7Qo17k4aeo"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-40985d2e0ff8",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "379",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "s1694CAHwowUf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-40985d2e0ff8",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "221",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "I94vCKkpQNsx6"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-40985d2e0ff8",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "123",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "RNfAfPtJK3KHE"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-40985d2e0ff8",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "213",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "Gk04vo9RXpl3P"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-40985d2e0ff8",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "|",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "rkWPIUdNABAeP7V"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-40985d2e0ff8",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": ">.",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "GIF1vPXxInWrhl"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-40985d2e0ff8",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": null,
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "Oa1imYdRme"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-40985d2e0ff8",
|
||||
"choices": [],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": {
|
||||
"completion_tokens": 23,
|
||||
"prompt_tokens": 1048,
|
||||
"total_tokens": 1071,
|
||||
"completion_tokens_details": {
|
||||
"accepted_prediction_tokens": 0,
|
||||
"audio_tokens": 0,
|
||||
"reasoning_tokens": 0,
|
||||
"rejected_prediction_tokens": 0
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"audio_tokens": 0,
|
||||
"cached_tokens": 0
|
||||
}
|
||||
},
|
||||
"obfuscation": "0Xx3txQF13S"
|
||||
}
|
||||
}
|
||||
],
|
||||
"is_streaming": true
|
||||
},
|
||||
"id_normalization_mapping": {}
|
||||
}
|
||||
3008 tests/integration/responses/recordings/40a41380ede0dd4635618b64a166f89979aa6c479a626155f36045a677abe944.json generated Normal file
File diff suppressed because it is too large
@@ -0,0 +1,925 @@
{
  "test_id": "tests/integration/responses/test_tool_responses.py::test_response_sequential_file_search[client_with_models-txt=openai/gpt-4o:emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5:dim=768]",
  "request": {
    "method": "POST",
    "url": "https://api.openai.com/v1/v1/chat/completions",
    "headers": {},
    "body": {
      "model": "gpt-4o",
      "messages": [
        {"role": "user", "content": "How many experts does the Llama 4 Maverick model have?"},
        {"role": "assistant", "content": "", "tool_calls": [{"index": 0, "id": "call_FzhOmTdZThRndI5rSASPdAqr", "type": "function", "function": {"name": "knowledge_search", "arguments": "{\"query\":\"Llama 4 Maverick model number of experts\"}"}}]},
        {"role": "tool", "tool_call_id": "call_FzhOmTdZThRndI5rSASPdAqr", "content": [
          {"type": "text", "text": "knowledge_search tool found 1 chunks:\nBEGIN of knowledge_search tool results.\n"},
          {"type": "text", "text": "[1] document_id: file-797509666839, score: 0.019272299825769716, attributes: {'filename': 'test_sequential_file_search.txt', 'chunk_id': '3907d885-d8e7-a72d-1113-f7080454d97c', 'document_id': 'file-797509666839', 'token_count': 19.0, 'metadata_token_count': 11.0} (cite as <|file-797509666839|>)\nThe Llama 4 Maverick model has 128 experts in its mixture of experts architecture.\n"},
          {"type": "text", "text": "END of knowledge_search tool results.\n"},
          {"type": "text", "text": "The above results were retrieved to help answer the user's query: \"Llama 4 Maverick model number of experts\". Use them as supporting information only in answering this query. Cite sources immediately at the end of sentences before punctuation, using `<|file-id|>` format (e.g., 'This is a fact <|file-Cn3MSNn72ENTiiq11Qda4A|>.'). Do not add extra punctuation. Use only the file IDs provided (do not invent new ones).\n"}
        ]}
      ],
      "stream": true,
      "stream_options": {"include_usage": true},
      "tools": [{"type": "function", "function": {"name": "knowledge_search", "description": "Search for information in a database.", "parameters": {"type": "object", "properties": {"query": {"type": "string", "description": "The query to search for. Can be a natural language sentence or keywords."}}, "required": ["query"]}}}]
    },
    "endpoint": "/v1/chat/completions",
    "model": "gpt-4o"
  },
  "response": {
    "body": [
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": "", "function_call": null, "refusal": null, "role": "assistant", "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "Lk9Xf7hCFPS2tT"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": "The", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "14pQ6XFvX7eSh"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": " L", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "gPEg73EpAxR3FC"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": "lama", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "ZWJl6Mzcv95d"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": " ", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "zEYaSNtwtGmhfwy"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": "4", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "2tesGAvAkEOb8T6"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": " Maver", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "Hykn5kSQlG"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": "ick", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "xWW13SGjSybVX"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": " model", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "fAZjisJ63a"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": " has", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "FlTpZNfFG6rX"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": " ", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "9J9VrtXuLHug6II"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": "128", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "0EckZGr823mA9"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": " experts", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "dW7O5HFR"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": " in", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "5dRdaDvaXumkV"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": " its", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "kD1aZsGwZhMx"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": " mixture", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "IpxDJF0p"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": " of", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "WbnOG310xKaLq"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": " experts", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "sh58U2d8"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": " architecture", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "El3"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": " <", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "u3EtYZFJGaheZj"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": "|", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "QjdqqIuk8c7wMUp"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": "file", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "Zqcwf53n0hUw"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": "-", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "DfFLPM5V45QUiAm"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": "797", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "55snCUEJgoLyX"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": "509", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "pCqEKhy1wq8Vl"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": "666", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "c5QnCsKzuhFd0"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": "839", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "jFSbryUeH7ZyA"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": "|", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "uHktQBYsC92laeK"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": ">.", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "UUxHP1QGdz8MdR"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [{"delta": {"content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": "stop", "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "uExxZzWuXd"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-454a64d08460", "choices": [], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": {"completion_tokens": 29, "prompt_tokens": 359, "total_tokens": 388, "completion_tokens_details": {"accepted_prediction_tokens": 0, "audio_tokens": 0, "reasoning_tokens": 0, "rejected_prediction_tokens": 0}, "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}}, "obfuscation": "EjpA6XzHVgcj8"}}
    ],
    "is_streaming": true
  },
  "id_normalization_mapping": {}
}
@@ -0,0 +1,631 @@
{
  "test_id": "tests/integration/responses/test_tool_responses.py::test_response_sequential_file_search[client_with_models-txt=openai/gpt-4o:emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5:dim=768]",
  "request": {
    "method": "POST",
    "url": "https://api.openai.com/v1/v1/chat/completions",
    "headers": {},
    "body": {
      "model": "gpt-4o",
      "messages": [
        {"role": "user", "content": "How many experts does the Llama 4 Maverick model have?"},
        {"role": "assistant", "content": "", "tool_calls": [{"index": 0, "id": "call_FzhOmTdZThRndI5rSASPdAqr", "type": "function", "function": {"name": "knowledge_search", "arguments": "{\"query\":\"Llama 4 Maverick model number of experts\"}"}}]},
        {"role": "tool", "tool_call_id": "call_FzhOmTdZThRndI5rSASPdAqr", "content": [
          {"type": "text", "text": "knowledge_search tool found 1 chunks:\nBEGIN of knowledge_search tool results.\n"},
          {"type": "text", "text": "[1] document_id: file-797509666839, score: 0.019272299825769716, attributes: {'filename': 'test_sequential_file_search.txt', 'chunk_id': '3907d885-d8e7-a72d-1113-f7080454d97c', 'document_id': 'file-797509666839', 'token_count': 19.0, 'metadata_token_count': 11.0} (cite as <|file-797509666839|>)\nThe Llama 4 Maverick model has 128 experts in its mixture of experts architecture.\n"},
          {"type": "text", "text": "END of knowledge_search tool results.\n"},
          {"type": "text", "text": "The above results were retrieved to help answer the user's query: \"Llama 4 Maverick model number of experts\". Use them as supporting information only in answering this query. Cite sources immediately at the end of sentences before punctuation, using `<|file-id|>` format (e.g., 'This is a fact <|file-Cn3MSNn72ENTiiq11Qda4A|>.'). Do not add extra punctuation. Use only the file IDs provided (do not invent new ones).\n"}
        ]},
        {"role": "assistant", "content": "The Llama 4 Maverick model has 128 experts in its mixture of experts architecture <|file-797509666839|>."},
        {"role": "user", "content": "Can you tell me more about the architecture?"}
      ],
      "stream": true,
      "stream_options": {"include_usage": true},
      "tools": [{"type": "function", "function": {"name": "knowledge_search", "description": "Search for information in a database.", "parameters": {"type": "object", "properties": {"query": {"type": "string", "description": "The query to search for. Can be a natural language sentence or keywords."}}, "required": ["query"]}}}]
    },
    "endpoint": "/v1/chat/completions",
    "model": "gpt-4o"
  },
  "response": {
    "body": [
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-4d749d8c25ad", "choices": [{"delta": {"content": null, "function_call": null, "refusal": null, "role": "assistant", "tool_calls": [{"index": 0, "id": "call_y4Py1L2VscRQ5IBZ7gGpqpWv", "function": {"arguments": "", "name": "knowledge_search"}, "type": "function"}]}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "iFdF"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-4d749d8c25ad", "choices": [{"delta": {"content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [{"index": 0, "id": null, "function": {"arguments": "{\"", "name": null}, "type": null}]}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "gIC"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-4d749d8c25ad", "choices": [{"delta": {"content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [{"index": 0, "id": null, "function": {"arguments": "query", "name": null}, "type": null}]}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "P"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-4d749d8c25ad", "choices": [{"delta": {"content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [{"index": 0, "id": null, "function": {"arguments": "\":\"", "name": null}, "type": null}]}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "p"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-4d749d8c25ad", "choices": [{"delta": {"content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [{"index": 0, "id": null, "function": {"arguments": "L", "name": null}, "type": null}]}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "TAVud"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-4d749d8c25ad", "choices": [{"delta": {"content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [{"index": 0, "id": null, "function": {"arguments": "lama", "name": null}, "type": null}]}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "CX"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-4d749d8c25ad", "choices": [{"delta": {"content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [{"index": 0, "id": null, "function": {"arguments": " ", "name": null}, "type": null}]}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "hHmE5"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-4d749d8c25ad", "choices": [{"delta": {"content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [{"index": 0, "id": null, "function": {"arguments": "4", "name": null}, "type": null}]}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "CN4uS"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-4d749d8c25ad", "choices": [{"delta": {"content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [{"index": 0, "id": null, "function": {"arguments": " Maver", "name": null}, "type": null}]}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": ""}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-4d749d8c25ad", "choices": [{"delta": {"content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [{"index": 0, "id": null, "function": {"arguments": "ick", "name": null}, "type": null}]}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "0kI"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-4d749d8c25ad", "choices": [{"delta": {"content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [{"index": 0, "id": null, "function": {"arguments": " model", "name": null}, "type": null}]}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": ""}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-4d749d8c25ad", "choices": [{"delta": {"content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [{"index": 0, "id": null, "function": {"arguments": " architecture", "name": null}, "type": null}]}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "dyryTBF49"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-4d749d8c25ad", "choices": [{"delta": {"content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [{"index": 0, "id": null, "function": {"arguments": "\"}", "name": null}, "type": null}]}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "BHV"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-4d749d8c25ad", "choices": [{"delta": {"content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": "tool_calls", "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "qrKh"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-4d749d8c25ad", "choices": [], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": {"completion_tokens": 22, "prompt_tokens": 404, "total_tokens": 426, "completion_tokens_details": {"accepted_prediction_tokens": 0, "audio_tokens": 0, "reasoning_tokens": 0, "rejected_prediction_tokens": 0}, "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}}, "obfuscation": "ecpBTD3qjc75r"}}
    ],
    "is_streaming": true
  },
  "id_normalization_mapping": {}
}
2144 tests/integration/responses/recordings/5a3033c4d989d68cc418014d7b8ed7bbb5d6e538bd3620dec2f846e0c8fa52f8.json generated Normal file
File diff suppressed because it is too large
1018 tests/integration/responses/recordings/6d20aac5318b8bf5803c05c224e7ca6d5b5951df5408e6bca3d0ba2b963f2c73.json generated Normal file
File diff suppressed because it is too large
5211 tests/integration/responses/recordings/6e5759a3bd65f94c5ec325ee211fcae819b51d6877edc656548d863bd9b5652e.json generated Normal file
File diff suppressed because it is too large
1118 tests/integration/responses/recordings/82038830a1ad60e4e01fb5efafd760b6327f0b7e6e7fa4e80518bff9f6002e8f.json generated Normal file
File diff suppressed because it is too large
1942 tests/integration/responses/recordings/882e7f0e5fcfe9f3276692c344dc2fee082b189494dd4f4829825adc90a79d9c.json generated Normal file
File diff suppressed because it is too large
@@ -0,0 +1,763 @@
{
  "test_id": "tests/integration/responses/test_tool_responses.py::test_response_non_streaming_file_search[client_with_models-txt=openai/gpt-4o:emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5:dim=768-llama_experts]",
  "request": {
    "method": "POST",
    "url": "https://api.openai.com/v1/v1/chat/completions",
    "headers": {},
    "body": {
      "model": "gpt-4o",
      "messages": [
        {"role": "user", "content": "How many experts does the Llama 4 Maverick model have?"},
        {"role": "assistant", "content": "", "tool_calls": [{"index": 0, "id": "call_gZXRKN1HMDC16NP9wNPAkP9K", "type": "function", "function": {"name": "knowledge_search", "arguments": "{\"query\":\"Llama 4 Maverick model experts count\"}"}}]},
        {"role": "tool", "tool_call_id": "call_gZXRKN1HMDC16NP9wNPAkP9K", "content": [
          {"type": "text", "text": "knowledge_search tool found 1 chunks:\nBEGIN of knowledge_search tool results.\n"},
          {"type": "text", "text": "[1] document_id: file-864460993305, score: 0.011418752464355166, attributes: {'filename': 'test_response_non_streaming_file_search.txt', 'chunk_id': '869ae0c0-ab85-ca6f-e5d0-024381443c27', 'document_id': 'file-864460993305', 'token_count': 10.0, 'metadata_token_count': 13.0} (cite as <|file-864460993305|>)\nLlama 4 Maverick has 128 experts\n"},
          {"type": "text", "text": "END of knowledge_search tool results.\n"},
          {"type": "text", "text": "The above results were retrieved to help answer the user's query: \"Llama 4 Maverick model experts count\". Use them as supporting information only in answering this query. Cite sources immediately at the end of sentences before punctuation, using `<|file-id|>` format (e.g., 'This is a fact <|file-Cn3MSNn72ENTiiq11Qda4A|>.'). Do not add extra punctuation. Use only the file IDs provided (do not invent new ones).\n"}
        ]}
      ],
      "stream": true,
      "stream_options": {"include_usage": true},
      "tools": [{"type": "function", "function": {"name": "knowledge_search", "description": "Search for information in a database.", "parameters": {"type": "object", "properties": {"query": {"type": "string", "description": "The query to search for. Can be a natural language sentence or keywords."}}, "required": ["query"]}}}]
    },
    "endpoint": "/v1/chat/completions",
    "model": "gpt-4o"
  },
  "response": {
    "body": [
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-baa0ba98b7f3", "choices": [{"delta": {"content": "", "function_call": null, "refusal": null, "role": "assistant", "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "VvS2zeV5Z8apdX"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-baa0ba98b7f3", "choices": [{"delta": {"content": "The", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "NeElmbFuPxg9F"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-baa0ba98b7f3", "choices": [{"delta": {"content": " L", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "RA2Dv6fH3Xp28d"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-baa0ba98b7f3", "choices": [{"delta": {"content": "lama", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "mk2wpBSl9esL"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-baa0ba98b7f3", "choices": [{"delta": {"content": " ", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "WkghQrNy7WNFz7S"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-baa0ba98b7f3", "choices": [{"delta": {"content": "4", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "LOo1ya1Av8yejuX"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-baa0ba98b7f3", "choices": [{"delta": {"content": " Maver", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "Uj02OVTEBb"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-baa0ba98b7f3", "choices": [{"delta": {"content": "ick", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "7s3FiwwwgzGhy"}},
      {"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": {"id": "rec-baa0ba98b7f3", "choices": [{"delta": {"content": " model", "function_call": null, "refusal": null, "role": null, "tool_calls": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "WExrPT6Yjd"}},
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " has",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "vbf0YwoBbJsB"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " ",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "vYIgV2n0AuxwZ9F"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "128",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "lAS4gXrK4sNoq"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " experts",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "90lGUcaB"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " <",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "mnFZfKgXWsjWZe"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "|",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "eOcwjhvK0vIp2nj"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "file",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "5TijFZHKoeGs"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "-",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "MWGjx7wiu4tdFha"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "864",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "k9VH32AhyY519"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "460",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "dWxZtp4i8KhxZ"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "993",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "u2WHjDkGJE2hg"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "305",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "6fckZytfB9iS5"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "|",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "YGOP75uha3KyHao"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": ">.",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "emmym2mGHhvw9Q"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": null,
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "GoEMFfNFBW"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-baa0ba98b7f3",
|
||||
"choices": [],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": {
|
||||
"completion_tokens": 23,
|
||||
"prompt_tokens": 350,
|
||||
"total_tokens": 373,
|
||||
"completion_tokens_details": {
|
||||
"accepted_prediction_tokens": 0,
|
||||
"audio_tokens": 0,
|
||||
"reasoning_tokens": 0,
|
||||
"rejected_prediction_tokens": 0
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"audio_tokens": 0,
|
||||
"cached_tokens": 0
|
||||
}
|
||||
},
|
||||
"obfuscation": "ec6S325i8izl1"
|
||||
}
|
||||
}
|
||||
],
|
||||
"is_streaming": true
|
||||
},
|
||||
"id_normalization_mapping": {}
|
||||
}
|
||||
1276
tests/integration/responses/recordings/bb43ffac0034ef7fcca1786bcb53106b37f70f053c38a92e225f4107e48c9c72.json
generated
Normal file
File diff suppressed because it is too large
1901
tests/integration/responses/recordings/bb8ad4fa0847c0b408d8bfeb6cc6bc65d4afece55df8e8187dfdbf75d57b13ba.json
generated
Normal file
File diff suppressed because it is too large
@@ -0,0 +1,631 @@
{
  "test_id": "tests/integration/responses/test_tool_responses.py::test_response_sequential_file_search[openai_client-txt=openai/gpt-4o:emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5:dim=768]",
  "request": {
    "method": "POST",
    "url": "https://api.openai.com/v1/v1/chat/completions",
    "headers": {},
    "body": {
      "model": "gpt-4o",
      "messages": [
        { "role": "user", "content": "How many experts does the Llama 4 Maverick model have?" },
        { "role": "assistant", "content": "", "tool_calls": [ { "index": 0, "id": "call_4ac6gxccWFxDvEl8BizY3BJw", "type": "function", "function": { "name": "knowledge_search", "arguments": "{\"query\":\"Llama 4 Maverick model number of experts\"}" } } ] },
        { "role": "tool", "tool_call_id": "call_4ac6gxccWFxDvEl8BizY3BJw", "content": [ { "type": "text", "text": "knowledge_search tool found 1 chunks:\nBEGIN of knowledge_search tool results.\n" }, { "type": "text", "text": "[1] document_id: file-528246887823, score: 0.019272299825769716, attributes: {'filename': 'test_sequential_file_search.txt', 'chunk_id': '3907d885-d8e7-a72d-1113-f7080454d97c', 'document_id': 'file-528246887823', 'token_count': 19.0, 'metadata_token_count': 11.0} (cite as <|file-528246887823|>)\nThe Llama 4 Maverick model has 128 experts in its mixture of experts architecture.\n" }, { "type": "text", "text": "END of knowledge_search tool results.\n" }, { "type": "text", "text": "The above results were retrieved to help answer the user's query: \"Llama 4 Maverick model number of experts\". Use them as supporting information only in answering this query. Cite sources immediately at the end of sentences before punctuation, using `<|file-id|>` format (e.g., 'This is a fact <|file-Cn3MSNn72ENTiiq11Qda4A|>.'). Do not add extra punctuation. Use only the file IDs provided (do not invent new ones).\n" } ] },
        { "role": "assistant", "content": "The Llama 4 Maverick model has 128 experts in its mixture of experts architecture <|file-528246887823|>." },
        { "role": "user", "content": "Can you tell me more about the architecture?" }
      ],
      "stream": true,
      "stream_options": { "include_usage": true },
      "tools": [
        { "type": "function", "function": { "name": "knowledge_search", "description": "Search for information in a database.", "parameters": { "type": "object", "properties": { "query": { "type": "string", "description": "The query to search for. Can be a natural language sentence or keywords." } }, "required": [ "query" ] } } }
      ]
    },
    "endpoint": "/v1/chat/completions",
    "model": "gpt-4o"
  },
  "response": {
    "body": [
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-c0b147807a41", "choices": [ { "delta": { "content": null, "function_call": null, "refusal": null, "role": "assistant", "tool_calls": [ { "index": 0, "id": "call_2dn6pQIic4tAhxL0Q3R9v9oy", "function": { "arguments": "", "name": "knowledge_search" }, "type": "function" } ] }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "U5u2" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-c0b147807a41", "choices": [ { "delta": { "content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [ { "index": 0, "id": null, "function": { "arguments": "{\"", "name": null }, "type": null } ] }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "rC6" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-c0b147807a41", "choices": [ { "delta": { "content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [ { "index": 0, "id": null, "function": { "arguments": "query", "name": null }, "type": null } ] }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "4" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-c0b147807a41", "choices": [ { "delta": { "content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [ { "index": 0, "id": null, "function": { "arguments": "\":\"", "name": null }, "type": null } ] }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "E" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-c0b147807a41", "choices": [ { "delta": { "content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [ { "index": 0, "id": null, "function": { "arguments": "L", "name": null }, "type": null } ] }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "U1RKZ" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-c0b147807a41", "choices": [ { "delta": { "content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [ { "index": 0, "id": null, "function": { "arguments": "lama", "name": null }, "type": null } ] }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "N9" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-c0b147807a41", "choices": [ { "delta": { "content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [ { "index": 0, "id": null, "function": { "arguments": " ", "name": null }, "type": null } ] }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "eCM84" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-c0b147807a41", "choices": [ { "delta": { "content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [ { "index": 0, "id": null, "function": { "arguments": "4", "name": null }, "type": null } ] }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "RNtZo" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-c0b147807a41", "choices": [ { "delta": { "content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [ { "index": 0, "id": null, "function": { "arguments": " Maver", "name": null }, "type": null } ] }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-c0b147807a41", "choices": [ { "delta": { "content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [ { "index": 0, "id": null, "function": { "arguments": "ick", "name": null }, "type": null } ] }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "OmQ" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-c0b147807a41", "choices": [ { "delta": { "content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [ { "index": 0, "id": null, "function": { "arguments": " model", "name": null }, "type": null } ] }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-c0b147807a41", "choices": [ { "delta": { "content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [ { "index": 0, "id": null, "function": { "arguments": " architecture", "name": null }, "type": null } ] }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "Hd8hPZl2u" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-c0b147807a41", "choices": [ { "delta": { "content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": [ { "index": 0, "id": null, "function": { "arguments": "\"}", "name": null }, "type": null } ] }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "5bs" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-c0b147807a41", "choices": [ { "delta": { "content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": "tool_calls", "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "eMIj" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-c0b147807a41", "choices": [], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": { "completion_tokens": 22, "prompt_tokens": 404, "total_tokens": 426, "completion_tokens_details": { "accepted_prediction_tokens": 0, "audio_tokens": 0, "reasoning_tokens": 0, "rejected_prediction_tokens": 0 }, "prompt_tokens_details": { "audio_tokens": 0, "cached_tokens": 0 } }, "obfuscation": "ofat2LchRvz8V" } }
    ],
    "is_streaming": true
  },
  "id_normalization_mapping": {}
}
@@ -0,0 +1,925 @@
{
  "test_id": "tests/integration/responses/test_tool_responses.py::test_response_sequential_file_search[openai_client-txt=openai/gpt-4o:emb=sentence-transformers/nomic-ai/nomic-embed-text-v1.5:dim=768]",
  "request": {
    "method": "POST",
    "url": "https://api.openai.com/v1/v1/chat/completions",
    "headers": {},
    "body": {
      "model": "gpt-4o",
      "messages": [
        { "role": "user", "content": "How many experts does the Llama 4 Maverick model have?" },
        { "role": "assistant", "content": "", "tool_calls": [ { "index": 0, "id": "call_4ac6gxccWFxDvEl8BizY3BJw", "type": "function", "function": { "name": "knowledge_search", "arguments": "{\"query\":\"Llama 4 Maverick model number of experts\"}" } } ] },
        { "role": "tool", "tool_call_id": "call_4ac6gxccWFxDvEl8BizY3BJw", "content": [ { "type": "text", "text": "knowledge_search tool found 1 chunks:\nBEGIN of knowledge_search tool results.\n" }, { "type": "text", "text": "[1] document_id: file-528246887823, score: 0.019272299825769716, attributes: {'filename': 'test_sequential_file_search.txt', 'chunk_id': '3907d885-d8e7-a72d-1113-f7080454d97c', 'document_id': 'file-528246887823', 'token_count': 19.0, 'metadata_token_count': 11.0} (cite as <|file-528246887823|>)\nThe Llama 4 Maverick model has 128 experts in its mixture of experts architecture.\n" }, { "type": "text", "text": "END of knowledge_search tool results.\n" }, { "type": "text", "text": "The above results were retrieved to help answer the user's query: \"Llama 4 Maverick model number of experts\". Use them as supporting information only in answering this query. Cite sources immediately at the end of sentences before punctuation, using `<|file-id|>` format (e.g., 'This is a fact <|file-Cn3MSNn72ENTiiq11Qda4A|>.'). Do not add extra punctuation. Use only the file IDs provided (do not invent new ones).\n" } ] }
      ],
      "stream": true,
      "stream_options": { "include_usage": true },
      "tools": [
        { "type": "function", "function": { "name": "knowledge_search", "description": "Search for information in a database.", "parameters": { "type": "object", "properties": { "query": { "type": "string", "description": "The query to search for. Can be a natural language sentence or keywords." } }, "required": [ "query" ] } } }
      ]
    },
    "endpoint": "/v1/chat/completions",
    "model": "gpt-4o"
  },
  "response": {
    "body": [
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": "", "function_call": null, "refusal": null, "role": "assistant", "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "DzrEfuLOuw4cnb" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": "The", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "CsVsWYnTMLfCu" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": " L", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "45hLla9Dhdu3x9" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": "lama", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "AhCUnf7tqKqC" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": " ", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "gvAEwnHAgMzITVb" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": "4", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "mGUFWICkd1S0jlx" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": " Maver", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "e85JCyNVPe" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": "ick", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "5vQf0h4IJTGGt" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": " model", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "anovsNqaSC" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": " has", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "fS6GYg8pBO8Q" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": " ", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "vO7onsnvWf5kjUI" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": "128", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "pdFjXciA0pN5w" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": " experts", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "eMMaKcAW" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": " in", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "JFDRUy7B9ktO0" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": " its", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "QlQIiohVPMVQ" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": " mixture", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "UuR2QmMR" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": " of", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "t0uvHdtkB4Fsl" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": " experts", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "3G1KX2gw" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": " architecture", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "x2J" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": " <", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "fbLYZDlS7xvywf" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": "|", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "vAxoGpf245DPeM8" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": "file", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "gLu1ZShAlH4C" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": "-", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "PdMvc8X2LtbhyFU" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": "528", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "0S00nwBZD0Cah" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": "246", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "fa7s8AYzHjMph" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": "887", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "hrwMBgH8bsKYT" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": "823", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "NBJ8yJWJjBCCQ" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": "|", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "AAzbONdy9ExzSBR" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": ">.", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "THiCsk4cqjABWJ" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [ { "delta": { "content": null, "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": "stop", "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "rzm64SnHTE" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-cf185c868634", "choices": [], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": { "completion_tokens": 29, "prompt_tokens": 359, "total_tokens": 388, "completion_tokens_details": { "accepted_prediction_tokens": 0, "audio_tokens": 0, "reasoning_tokens": 0, "rejected_prediction_tokens": 0 }, "prompt_tokens_details": { "audio_tokens": 0, "cached_tokens": 0 } }, "obfuscation": "AnUv1BxAB2uOY" } }
    ],
    "is_streaming": true
  },
  "id_normalization_mapping": {}
}
@@ -0,0 +1,952 @@
{
  "test_id": "tests/integration/responses/test_file_search.py::test_response_file_search_filter_compound_and[client_with_models-txt=openai/gpt-4o]",
  "request": {
    "method": "POST",
    "url": "https://api.openai.com/v1/v1/chat/completions",
    "headers": {},
    "body": {
      "model": "gpt-4o",
      "messages": [
        { "role": "user", "content": "What are the engineering updates from the US?" },
        { "role": "assistant", "content": "", "tool_calls": [ { "index": 0, "id": "call_rST37XuKuJQcEBfmoTnNQzNe", "type": "function", "function": { "name": "knowledge_search", "arguments": "{\"query\":\"engineering updates from the US\"}" } } ] },
        { "role": "tool", "tool_call_id": "call_rST37XuKuJQcEBfmoTnNQzNe", "content": [ { "type": "text", "text": "knowledge_search tool found 1 chunks:\nBEGIN of knowledge_search tool results.\n" }, { "type": "text", "text": "[1] document_id: file-710505118847, score: 0.005345607610573921, attributes: {'region': 'us', 'category': 'engineering', 'date': 1680307200.0, 'filename': 'us_engineering_q2.txt', 'chunk_id': '084e15ad-480a-eae8-9242-391c53854867', 'document_id': 'file-710505118847', 'token_count': 18.0, 'metadata_token_count': 32.0} (cite as <|file-710505118847|>)\nUS technical updates for Q2 2023. New features deployed in the US region.\n" }, { "type": "text", "text": "END of knowledge_search tool results.\n" }, { "type": "text", "text": "The above results were retrieved to help answer the user's query: \"engineering updates from the US\". Use them as supporting information only in answering this query. Cite sources immediately at the end of sentences before punctuation, using `<|file-id|>` format (e.g., 'This is a fact <|file-Cn3MSNn72ENTiiq11Qda4A|>.'). Do not add extra punctuation. Use only the file IDs provided (do not invent new ones).\n" } ] }
      ],
      "stream": true,
      "stream_options": { "include_usage": true },
      "tools": [
        { "type": "function", "function": { "name": "knowledge_search", "description": "Search for information in a database.", "parameters": { "type": "object", "properties": { "query": { "type": "string", "description": "The query to search for. Can be a natural language sentence or keywords." } }, "required": [ "query" ] } } }
      ]
    },
    "endpoint": "/v1/chat/completions",
    "model": "gpt-4o"
  },
  "response": {
    "body": [
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-d6f74a7dd25a", "choices": [ { "delta": { "content": "", "function_call": null, "refusal": null, "role": "assistant", "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "CVT4TMzBPNlTqA" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-d6f74a7dd25a", "choices": [ { "delta": { "content": "The", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "Rlj8tcP3E7bOB" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-d6f74a7dd25a", "choices": [ { "delta": { "content": " engineering", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "8lga" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__data__": { "id": "rec-d6f74a7dd25a", "choices": [ { "delta": { "content": " updates", "function_call": null, "refusal": null, "role": null, "tool_calls": null }, "finish_reason": null, "index": 0, "logprobs": null } ], "created": 0, "model": "gpt-4o-2024-08-06", "object": "chat.completion.chunk", "service_tier": "default", "system_fingerprint": "fp_a788c5aef0", "usage": null, "obfuscation": "6fwO0WkR" } },
      { "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " from",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "BryajibrQvv"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " the",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "iTlMgikEguMP"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " US",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "79xbcCa6na7en"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " include",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "q7q4AkjT"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " new",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "fiyvaDyv5eet"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " features",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "cBkhZfR"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " deployed",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "EaW5Ixt"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " in",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "xLVfGMTiR4OMS"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " the",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "cncqZQApoIjH"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " region",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "yiSqVtnqF"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " for",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "sbDWGbV8OoYi"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " Q",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "E1ZJCGd5c2IH7b"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "2",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "agHXieAbH98A2VE"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " ",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "Ht3DkQwQs7t32Aw"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "202",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "j4r88Vvqcm7VY"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "3",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "pv9GLKOSpa0BHEr"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": " <",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "iBXT8JWz9X1J1q"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "|",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "D1gi2w0f0DN5n3k"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "file",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "zxHM3I5wmPGU"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "-",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "Gl7oL62eU6xIrUp"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "710",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "l4RX4sx1BfQA6"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "505",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "AGyEWqU2sDL6e"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "118",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "BReQxn8kTEiA5"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "847",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "yN9PEtunpAkNv"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": "|",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "bKBLmRBkxlk61fP"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": ">.",
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": null,
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "077BDwQit7hWfz"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [
|
||||
{
|
||||
"delta": {
|
||||
"content": null,
|
||||
"function_call": null,
|
||||
"refusal": null,
|
||||
"role": null,
|
||||
"tool_calls": null
|
||||
},
|
||||
"finish_reason": "stop",
|
||||
"index": 0,
|
||||
"logprobs": null
|
||||
}
|
||||
],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": null,
|
||||
"obfuscation": "LOYztD3Yfb"
|
||||
}
|
||||
},
|
||||
{
|
||||
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
|
||||
"__data__": {
|
||||
"id": "rec-d6f74a7dd25a",
|
||||
"choices": [],
|
||||
"created": 0,
|
||||
"model": "gpt-4o-2024-08-06",
|
||||
"object": "chat.completion.chunk",
|
||||
"service_tier": "default",
|
||||
"system_fingerprint": "fp_a788c5aef0",
|
||||
"usage": {
|
||||
"completion_tokens": 30,
|
||||
"prompt_tokens": 364,
|
||||
"total_tokens": 394,
|
||||
"completion_tokens_details": {
|
||||
"accepted_prediction_tokens": 0,
|
||||
"audio_tokens": 0,
|
||||
"reasoning_tokens": 0,
|
||||
"rejected_prediction_tokens": 0
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"audio_tokens": 0,
|
||||
"cached_tokens": 0
|
||||
}
|
||||
},
|
||||
"obfuscation": "9lHtlsx9YsVH6"
|
||||
}
|
||||
}
|
||||
],
|
||||
"is_streaming": true
|
||||
},
|
||||
"id_normalization_mapping": {}
|
||||
}
1456  tests/integration/responses/recordings/dd67347dee58190dea53588f8914211d279b80b6198cbe8b8b789fad2a0d0687.json  generated  Normal file  (file diff suppressed because it is too large)
2549  tests/integration/responses/recordings/f4cfc578243d8c3e2e61488bfcfc571cbc160a97dc570076a869d4fec1dc8c52.json  generated  Normal file  (file diff suppressed because it is too large)
2279  tests/integration/responses/recordings/fa055fef7ea5386adaeaa5ddea61a417c161c49f64b9d92de0b96f4a892bc83c.json  generated  Normal file  (file diff suppressed because it is too large)
@@ -82,23 +82,37 @@ def skip_if_provider_doesnt_support_openai_vector_stores_search(client_with_mode
 
 @pytest.fixture(scope="session")
 def sample_chunks():
+    from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
+
+    chunks_data = [
+        (
+            "Python is a high-level programming language that emphasizes code readability and allows programmers to express concepts in fewer lines of code than would be possible in languages such as C++ or Java.",
+            "doc1",
+            "programming",
+        ),
+        (
+            "Machine learning is a subset of artificial intelligence that enables systems to automatically learn and improve from experience without being explicitly programmed, using statistical techniques to give computer systems the ability to progressively improve performance on a specific task.",
+            "doc2",
+            "ai",
+        ),
+        (
+            "Data structures are fundamental to computer science because they provide organized ways to store and access data efficiently, enable faster processing of data through optimized algorithms, and form the building blocks for more complex software systems.",
+            "doc3",
+            "computer_science",
+        ),
+        (
+            "Neural networks are inspired by biological neural networks found in animal brains, using interconnected nodes called artificial neurons to process information through weighted connections that can be trained to recognize patterns and solve complex problems through iterative learning.",
+            "doc4",
+            "ai",
+        ),
+    ]
     return [
         Chunk(
-            content="Python is a high-level programming language that emphasizes code readability and allows programmers to express concepts in fewer lines of code than would be possible in languages such as C++ or Java.",
-            metadata={"document_id": "doc1", "topic": "programming"},
-        ),
-        Chunk(
-            content="Machine learning is a subset of artificial intelligence that enables systems to automatically learn and improve from experience without being explicitly programmed, using statistical techniques to give computer systems the ability to progressively improve performance on a specific task.",
-            metadata={"document_id": "doc2", "topic": "ai"},
-        ),
-        Chunk(
-            content="Data structures are fundamental to computer science because they provide organized ways to store and access data efficiently, enable faster processing of data through optimized algorithms, and form the building blocks for more complex software systems.",
-            metadata={"document_id": "doc3", "topic": "computer_science"},
-        ),
-        Chunk(
-            content="Neural networks are inspired by biological neural networks found in animal brains, using interconnected nodes called artificial neurons to process information through weighted connections that can be trained to recognize patterns and solve complex problems through iterative learning.",
-            metadata={"document_id": "doc4", "topic": "ai"},
-        ),
+            content=content,
+            chunk_id=generate_chunk_id(doc_id, content),
+            metadata={"document_id": doc_id, "topic": topic},
+        )
+        for content, doc_id, topic in chunks_data
     ]
@@ -13,23 +13,33 @@ from ..conftest import vector_provider_wrapper
 
 @pytest.fixture(scope="session")
 def sample_chunks():
+    from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
+
+    chunks_data = [
+        (
+            "Python is a high-level programming language that emphasizes code readability and allows programmers to express concepts in fewer lines of code than would be possible in languages such as C++ or Java.",
+            "doc1",
+        ),
+        (
+            "Machine learning is a subset of artificial intelligence that enables systems to automatically learn and improve from experience without being explicitly programmed, using statistical techniques to give computer systems the ability to progressively improve performance on a specific task.",
+            "doc2",
+        ),
+        (
+            "Data structures are fundamental to computer science because they provide organized ways to store and access data efficiently, enable faster processing of data through optimized algorithms, and form the building blocks for more complex software systems.",
+            "doc3",
+        ),
+        (
+            "Neural networks are inspired by biological neural networks found in animal brains, using interconnected nodes called artificial neurons to process information through weighted connections that can be trained to recognize patterns and solve complex problems through iterative learning.",
+            "doc4",
+        ),
+    ]
     return [
         Chunk(
-            content="Python is a high-level programming language that emphasizes code readability and allows programmers to express concepts in fewer lines of code than would be possible in languages such as C++ or Java.",
-            metadata={"document_id": "doc1"},
-        ),
-        Chunk(
-            content="Machine learning is a subset of artificial intelligence that enables systems to automatically learn and improve from experience without being explicitly programmed, using statistical techniques to give computer systems the ability to progressively improve performance on a specific task.",
-            metadata={"document_id": "doc2"},
-        ),
-        Chunk(
-            content="Data structures are fundamental to computer science because they provide organized ways to store and access data efficiently, enable faster processing of data through optimized algorithms, and form the building blocks for more complex software systems.",
-            metadata={"document_id": "doc3"},
-        ),
-        Chunk(
-            content="Neural networks are inspired by biological neural networks found in animal brains, using interconnected nodes called artificial neurons to process information through weighted connections that can be trained to recognize patterns and solve complex problems through iterative learning.",
-            metadata={"document_id": "doc4"},
-        ),
+            content=content,
+            chunk_id=generate_chunk_id(doc_id, content),
+            metadata={"document_id": doc_id},
+        )
+        for content, doc_id in chunks_data
     ]
@@ -168,6 +178,7 @@ def test_insert_chunks_with_precomputed_embeddings(
     chunks_with_embeddings = [
         Chunk(
             content="This is a test chunk with precomputed embedding.",
+            chunk_id="chunk1",
            metadata={"document_id": "doc1", "source": "precomputed", "chunk_id": "chunk1"},
             embedding=[0.1] * int(embedding_dimension),
         ),
@@ -215,9 +226,12 @@ def test_query_returns_valid_object_when_identical_to_embedding_in_vdb(
 
     actual_vector_store_id = register_response.id
 
+    from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
+
     chunks_with_embeddings = [
         Chunk(
             content="duplicate",
+            chunk_id=generate_chunk_id("doc1", "duplicate"),
             metadata={"document_id": "doc1", "source": "precomputed"},
             embedding=[0.1] * int(embedding_dimension),
         ),
@@ -192,18 +192,18 @@ async def test_create_agent_session_persistence(agents_impl, sample_agent_config
     assert session_response.session_id is not None
 
     # Verify the session was stored
-    session = await agents_impl.get_agents_session(agent_id, session_response.session_id)
+    session = await agents_impl.get_agents_session(session_response.session_id, agent_id)
     assert session.session_name == "test_session"
     assert session.session_id == session_response.session_id
     assert session.started_at is not None
     assert session.turns == []
 
     # Delete the session
-    await agents_impl.delete_agents_session(agent_id, session_response.session_id)
+    await agents_impl.delete_agents_session(session_response.session_id, agent_id)
 
     # Verify the session was deleted
     with pytest.raises(ValueError):
-        await agents_impl.get_agents_session(agent_id, session_response.session_id)
+        await agents_impl.get_agents_session(session_response.session_id, agent_id)
 
 
 @pytest.mark.parametrize("enable_session_persistence", [True, False])
@@ -226,11 +226,11 @@ async def test_list_agent_sessions_persistence(agents_impl, sample_agent_config,
     assert session2.session_id in session_ids
 
     # Delete one session
-    await agents_impl.delete_agents_session(agent_id, session1.session_id)
+    await agents_impl.delete_agents_session(session1.session_id, agent_id)
 
     # Verify the session was deleted
     with pytest.raises(ValueError):
-        await agents_impl.get_agents_session(agent_id, session1.session_id)
+        await agents_impl.get_agents_session(session1.session_id, agent_id)
 
     # List sessions again
     sessions = await agents_impl.list_agent_sessions(agent_id)
@@ -43,9 +43,15 @@ def embedding_dimension() -> int:
 @pytest.fixture(scope="session")
 def sample_chunks():
     """Generates chunks that force multiple batches for a single document to expose ID conflicts."""
+    from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
+
     n, k = 10, 3
     sample = [
-        Chunk(content=f"Sentence {i} from document {j}", metadata={"document_id": f"document-{j}"})
+        Chunk(
+            content=f"Sentence {i} from document {j}",
+            chunk_id=generate_chunk_id(f"document-{j}", f"Sentence {i} from document {j}"),
+            metadata={"document_id": f"document-{j}"},
+        )
         for j in range(k)
         for i in range(n)
     ]
@@ -53,6 +59,7 @@ def sample_chunks():
         [
             Chunk(
                 content=f"Sentence {i} from document {j + k}",
+                chunk_id=f"document-{j}-chunk-{i}",
                 chunk_metadata=ChunkMetadata(
                     document_id=f"document-{j + k}",
                     chunk_id=f"document-{j}-chunk-{i}",
@@ -73,6 +80,7 @@ def sample_chunks_with_metadata():
     sample = [
         Chunk(
             content=f"Sentence {i} from document {j}",
+            chunk_id=f"document-{j}-chunk-{i}",
             metadata={"document_id": f"document-{j}"},
             chunk_metadata=ChunkMetadata(
                 document_id=f"document-{j}",
@@ -49,9 +49,21 @@ def vector_store_id():
 
 @pytest.fixture
 def sample_chunks():
+    from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
+
     return [
-        Chunk(content="MOCK text content 1", mime_type="text/plain", metadata={"document_id": "mock-doc-1"}),
-        Chunk(content="MOCK text content 1", mime_type="text/plain", metadata={"document_id": "mock-doc-2"}),
+        Chunk(
+            content="MOCK text content 1",
+            chunk_id=generate_chunk_id("mock-doc-1", "MOCK text content 1"),
+            mime_type="text/plain",
+            metadata={"document_id": "mock-doc-1"},
+        ),
+        Chunk(
+            content="MOCK text content 1",
+            chunk_id=generate_chunk_id("mock-doc-2", "MOCK text content 1"),
+            mime_type="text/plain",
+            metadata={"document_id": "mock-doc-2"},
+        ),
     ]
@@ -434,9 +434,15 @@ async def test_query_chunks_hybrid_tie_breaking(
     sqlite_vec_index, sample_embeddings, embedding_dimension, tmp_path_factory
 ):
     """Test tie-breaking and determinism when scores are equal."""
+    from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
+
     # Create two chunks with the same content and embedding
-    chunk1 = Chunk(content="identical", metadata={"document_id": "docA"})
-    chunk2 = Chunk(content="identical", metadata={"document_id": "docB"})
+    chunk1 = Chunk(
+        content="identical", chunk_id=generate_chunk_id("docA", "identical"), metadata={"document_id": "docA"}
+    )
+    chunk2 = Chunk(
+        content="identical", chunk_id=generate_chunk_id("docB", "identical"), metadata={"document_id": "docB"}
+    )
     chunks = [chunk1, chunk2]
     # Use the same embedding for both chunks to ensure equal scores
     same_embedding = sample_embeddings[0]
@@ -135,10 +135,24 @@ async def test_insert_chunks_with_missing_document_id(vector_io_adapter):
     vector_io_adapter.cache["db1"] = fake_index
 
     # Various document_id scenarios that shouldn't crash
+    from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
+
     chunks = [
-        Chunk(content="has doc_id in metadata", metadata={"document_id": "doc-1"}),
-        Chunk(content="no doc_id anywhere", metadata={"source": "test"}),
-        Chunk(content="doc_id in chunk_metadata", chunk_metadata=ChunkMetadata(document_id="doc-3")),
+        Chunk(
+            content="has doc_id in metadata",
+            chunk_id=generate_chunk_id("doc-1", "has doc_id in metadata"),
+            metadata={"document_id": "doc-1"},
+        ),
+        Chunk(
+            content="no doc_id anywhere",
+            chunk_id=generate_chunk_id("unknown", "no doc_id anywhere"),
+            metadata={"source": "test"},
+        ),
+        Chunk(
+            content="doc_id in chunk_metadata",
+            chunk_id=generate_chunk_id("doc-3", "doc_id in chunk_metadata"),
+            chunk_metadata=ChunkMetadata(document_id="doc-3"),
+        ),
     ]
 
     # Should work without KeyError
@@ -151,7 +165,9 @@ async def test_document_id_with_invalid_type_raises_error():
     from llama_stack.apis.vector_io import Chunk
 
     # Integer document_id should raise TypeError
-    chunk = Chunk(content="test", metadata={"document_id": 12345})
+    from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
+
+    chunk = Chunk(content="test", chunk_id=generate_chunk_id("test", "test"), metadata={"document_id": 12345})
     with pytest.raises(TypeError) as exc_info:
         _ = chunk.document_id
     assert "metadata['document_id'] must be a string" in str(exc_info.value)
@@ -159,7 +175,9 @@ async def test_document_id_with_invalid_type_raises_error():
 
 
 async def test_query_chunks_calls_underlying_index_and_returns(vector_io_adapter):
-    expected = QueryChunksResponse(chunks=[Chunk(content="c1")], scores=[0.1])
+    from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
+
+    expected = QueryChunksResponse(chunks=[Chunk(content="c1", chunk_id=generate_chunk_id("test", "c1"))], scores=[0.1])
     fake_index = AsyncMock(query_chunks=AsyncMock(return_value=expected))
     vector_io_adapter.cache["db1"] = fake_index
 
@@ -18,13 +18,12 @@ from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
 
 
 def test_generate_chunk_id():
-    chunks = [
-        Chunk(content="test", metadata={"document_id": "doc-1"}),
-        Chunk(content="test ", metadata={"document_id": "doc-1"}),
-        Chunk(content="test 3", metadata={"document_id": "doc-1"}),
-    ]
+    """Test that generate_chunk_id produces expected hashes."""
+    chunk_id1 = generate_chunk_id("doc-1", "test")
+    chunk_id2 = generate_chunk_id("doc-1", "test ")
+    chunk_id3 = generate_chunk_id("doc-1", "test 3")
 
-    chunk_ids = sorted([chunk.chunk_id for chunk in chunks])
+    chunk_ids = sorted([chunk_id1, chunk_id2, chunk_id3])
     assert chunk_ids == [
         "31d1f9a3-c8d2-66e7-3c37-af2acd329778",
         "d07dade7-29c0-cda7-df29-0249a1dcbc3e",
@@ -33,42 +32,49 @@ def test_generate_chunk_id():
 
 
 def test_generate_chunk_id_with_window():
-    chunk = Chunk(content="test", metadata={"document_id": "doc-1"})
+    """Test that generate_chunk_id with chunk_window produces different IDs."""
+    # Create a chunk object to match the original test behavior (passing object to generate_chunk_id)
+    chunk = Chunk(content="test", chunk_id="placeholder", metadata={"document_id": "doc-1"})
     chunk_id1 = generate_chunk_id("doc-1", chunk, chunk_window="0-1")
     chunk_id2 = generate_chunk_id("doc-1", chunk, chunk_window="1-2")
-    assert chunk_id1 == "8630321a-d9cb-2bb6-cd28-ebf68dafd866"
-    assert chunk_id2 == "13a1c09a-cbda-b61a-2d1a-7baa90888685"
+    # Verify that different windows produce different IDs
+    assert chunk_id1 != chunk_id2
+    assert len(chunk_id1) == 36  # Valid UUID format
+    assert len(chunk_id2) == 36  # Valid UUID format
 
 
-def test_chunk_id():
-    # Test with existing chunk ID
-    chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"})
-    assert chunk_with_id.chunk_id == "11704f92-42b6-61df-bf85-6473e7708fbd"
-
-    # Test with document ID in metadata
-    chunk_with_doc_id = Chunk(content="test", metadata={"document_id": "doc-1"})
-    assert chunk_with_doc_id.chunk_id == generate_chunk_id("doc-1", "test")
-
-    # Test chunks with ChunkMetadata
-    chunk_with_metadata = Chunk(
+def test_chunk_creation_with_explicit_id():
+    """Test that chunks can be created with explicit chunk_id."""
+    chunk_id = generate_chunk_id("doc-1", "test")
+    chunk = Chunk(
         content="test",
-        metadata={"document_id": "existing-id", "chunk_id": "chunk-id-1"},
+        chunk_id=chunk_id,
+        metadata={"document_id": "doc-1"},
     )
+    assert chunk.chunk_id == chunk_id
+    assert chunk.chunk_id == "31d1f9a3-c8d2-66e7-3c37-af2acd329778"
+
+
+def test_chunk_with_metadata():
+    """Test chunks with ChunkMetadata."""
+    chunk_id = "chunk-id-1"
+    chunk = Chunk(
+        content="test",
+        chunk_id=chunk_id,
+        metadata={"document_id": "existing-id"},
         chunk_metadata=ChunkMetadata(document_id="document_1"),
     )
-    assert chunk_with_metadata.chunk_id == "chunk-id-1"
-
-    # Test with no ID or document ID
-    chunk_without_id = Chunk(content="test")
-    generated_id = chunk_without_id.chunk_id
-    assert isinstance(generated_id, str) and len(generated_id) == 36  # Should be a valid UUID
+    assert chunk.chunk_id == "chunk-id-1"
+    assert chunk.document_id == "existing-id"  # metadata takes precedence
 
 
-def test_stored_chunk_id_alias():
-    # Test with existing chunk ID alias
-    chunk_with_alias = Chunk(content="test", metadata={"document_id": "existing-id", "chunk_id": "chunk-id-1"})
-    assert chunk_with_alias.chunk_id == "chunk-id-1"
-    serialized_chunk = chunk_with_alias.model_dump()
-    assert serialized_chunk["stored_chunk_id"] == "chunk-id-1"
-    # showing chunk_id is not serialized (i.e., a computed field)
-    assert "chunk_id" not in serialized_chunk
-    assert chunk_with_alias.stored_chunk_id == "chunk-id-1"
+def test_chunk_serialization():
+    """Test that chunk_id is properly serialized."""
+    chunk = Chunk(
+        content="test",
+        chunk_id="test-chunk-id",
+        metadata={"document_id": "doc-1"},
+    )
+    serialized_chunk = chunk.model_dump()
+    assert serialized_chunk["chunk_id"] == "test-chunk-id"
+    assert "chunk_id" in serialized_chunk
@@ -41,6 +41,7 @@ class TestRagQuery:
         interleaved_content = MagicMock()
         chunk = Chunk(
             content=interleaved_content,
+            chunk_id="chunk1",
             metadata={
                 "key1": "value1",
                 "token_count": 10,
@@ -48,7 +49,6 @@ class TestRagQuery:
                 # Note this is inserted into `metadata` during MemoryToolRuntimeImpl().insert()
                 "document_id": "doc1",
             },
-            stored_chunk_id="chunk1",
             chunk_metadata=chunk_metadata,
         )
 
@@ -101,8 +101,8 @@ class TestRagQuery:
         )
         chunk1 = Chunk(
             content="chunk from db1",
+            chunk_id="c1",
             metadata={"vector_store_id": "db1", "document_id": "doc1"},
-            stored_chunk_id="c1",
             chunk_metadata=chunk_metadata1,
         )
 
@@ -114,8 +114,8 @@ class TestRagQuery:
         )
         chunk2 = Chunk(
             content="chunk from db2",
+            chunk_id="c2",
             metadata={"vector_store_id": "db2", "document_id": "doc2"},
-            stored_chunk_id="c2",
             chunk_metadata=chunk_metadata2,
         )
 
@@ -26,6 +26,7 @@ from llama_stack.providers.utils.memory.vector_store import (
     content_from_doc,
     make_overlapped_chunks,
 )
+from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
 
 DUMMY_PDF_PATH = Path(os.path.abspath(__file__)).parent / "fixtures" / "dummy.pdf"
 # Depending on the machine, this can get parsed a couple of ways
@@ -53,6 +54,7 @@ class TestChunk:
     def test_chunk(self):
         chunk = Chunk(
             content="Example chunk content",
+            chunk_id=generate_chunk_id("test-doc", "Example chunk content"),
             metadata={"key": "value"},
             embedding=[0.1, 0.2, 0.3],
         )
@@ -63,6 +65,7 @@ class TestChunk:
 
         chunk_no_embedding = Chunk(
             content="Example chunk content",
+            chunk_id=generate_chunk_id("test-doc", "Example chunk content"),
             metadata={"key": "value"},
         )
         assert chunk_no_embedding.embedding is None
@@ -218,8 +221,8 @@ class TestVectorStoreWithIndex:
         )
 
         chunks = [
-            Chunk(content="Test 1", embedding=None, metadata={}),
-            Chunk(content="Test 2", embedding=None, metadata={}),
+            Chunk(content="Test 1", chunk_id=generate_chunk_id("test-doc", "Test 1"), embedding=None, metadata={}),
+            Chunk(content="Test 2", chunk_id=generate_chunk_id("test-doc", "Test 2"), embedding=None, metadata={}),
         ]
 
         mock_inference_api.openai_embeddings.return_value.data = [
@@ -254,8 +257,18 @@ class TestVectorStoreWithIndex:
         )
 
         chunks = [
-            Chunk(content="Test 1", embedding=[0.1, 0.2, 0.3], metadata={}),
-            Chunk(content="Test 2", embedding=[0.4, 0.5, 0.6], metadata={}),
+            Chunk(
+                content="Test 1",
+                chunk_id=generate_chunk_id("test-doc", "Test 1"),
+                embedding=[0.1, 0.2, 0.3],
+                metadata={},
+            ),
+            Chunk(
+                content="Test 2",
+                chunk_id=generate_chunk_id("test-doc", "Test 2"),
+                embedding=[0.4, 0.5, 0.6],
+                metadata={},
+            ),
         ]
 
         await vector_store_with_index.insert_chunks(chunks)
@@ -279,25 +292,47 @@ class TestVectorStoreWithIndex:
 
         # Verify Chunk raises ValueError for invalid embedding type
         with pytest.raises(ValueError, match="Input should be a valid list"):
-            Chunk(content="Test 1", embedding="invalid_type", metadata={})
+            Chunk(
+                content="Test 1",
+                chunk_id=generate_chunk_id("test-doc", "Test 1"),
+                embedding="invalid_type",
+                metadata={},
+            )
 
         # Verify Chunk raises ValueError for invalid embedding type in insert_chunks (i.e., Chunk errors before insert_chunks is called)
         with pytest.raises(ValueError, match="Input should be a valid list"):
             await vector_store_with_index.insert_chunks(
                 [
-                    Chunk(content="Test 1", embedding=None, metadata={}),
-                    Chunk(content="Test 2", embedding="invalid_type", metadata={}),
+                    Chunk(
+                        content="Test 1", chunk_id=generate_chunk_id("test-doc", "Test 1"), embedding=None, metadata={}
+                    ),
+                    Chunk(
+                        content="Test 2",
+                        chunk_id=generate_chunk_id("test-doc", "Test 2"),
+                        embedding="invalid_type",
+                        metadata={},
+                    ),
                 ]
             )
 
         # Verify Chunk raises ValueError for invalid embedding element type in insert_chunks (i.e., Chunk errors before insert_chunks is called)
         with pytest.raises(ValueError, match=" Input should be a valid number, unable to parse string as a number "):
             await vector_store_with_index.insert_chunks(
-                Chunk(content="Test 1", embedding=[0.1, "string", 0.3], metadata={})
+                Chunk(
+                    content="Test 1",
+                    chunk_id=generate_chunk_id("test-doc", "Test 1"),
+                    embedding=[0.1, "string", 0.3],
+                    metadata={},
+                )
             )
 
         chunks_wrong_dim = [
-            Chunk(content="Test 1", embedding=[0.1, 0.2, 0.3, 0.4], metadata={}),
+            Chunk(
+                content="Test 1",
+                chunk_id=generate_chunk_id("test-doc", "Test 1"),
+                embedding=[0.1, 0.2, 0.3, 0.4],
+                metadata={},
+            ),
         ]
         with pytest.raises(ValueError, match="has dimension 4, expected 3"):
             await vector_store_with_index.insert_chunks(chunks_wrong_dim)
@@ -317,9 +352,14 @@ class TestVectorStoreWithIndex:
         )
 
         chunks = [
-            Chunk(content="Test 1", embedding=None, metadata={}),
-            Chunk(content="Test 2", embedding=[0.2, 0.2, 0.2], metadata={}),
-            Chunk(content="Test 3", embedding=None, metadata={}),
+            Chunk(content="Test 1", chunk_id=generate_chunk_id("test-doc", "Test 1"), embedding=None, metadata={}),
+            Chunk(
+                content="Test 2",
+                chunk_id=generate_chunk_id("test-doc", "Test 2"),
+                embedding=[0.2, 0.2, 0.2],
+                metadata={},
+            ),
+            Chunk(content="Test 3", chunk_id=generate_chunk_id("test-doc", "Test 3"), embedding=None, metadata={}),
         ]
 
         mock_inference_api.openai_embeddings.return_value.data = [