Merge branch 'main' into watsonx_hc

2025-12-27 15:12:00 +00:00 · 2025-06-16 14:44:46 +05:30 · 2025-06-16 14:44:46 +05:30 · f5388e252d
commit f5388e252d
parent f63520630e 985d0b156c
48 changed files with 2179 additions and 66 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -3240,6 +3240,59 @@
                }
            }
        },
+        "/v1/openai/v1/vector_stores/{vector_store_id}/files": {
+            "post": {
+                "responses": {
+                    "200": {
+                        "description": "A VectorStoreFileObject representing the attached file.",
+                        "content": {
+                            "application/json": {
+                                "schema": {
+                                    "$ref": "#/components/schemas/VectorStoreFileObject"
+                                }
+                            }
+                        }
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "VectorIO"
+                ],
+                "description": "Attach a file to a vector store.",
+                "parameters": [
+                    {
+                        "name": "vector_store_id",
+                        "in": "path",
+                        "description": "The ID of the vector store to attach the file to.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ],
+                "requestBody": {
+                    "content": {
+                        "application/json": {
+                            "schema": {
+                                "$ref": "#/components/schemas/OpenaiAttachFileToVectorStoreRequest"
+                            }
+                        }
+                    },
+                    "required": true
+                }
+            }
+        },
        "/v1/openai/v1/completions": {
            "post": {
                "responses": {
@ -7047,6 +7100,9 @@
                    {
                        "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
                    },
+                    {
+                        "$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall"
+                    },
                    {
                        "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
                    },
@ -7193,12 +7249,41 @@
                        "const": "file_search",
                        "default": "file_search"
                    },
-                    "vector_store_id": {
+                    "vector_store_ids": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        }
                    },
+                    "filters": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    },
+                    "max_num_results": {
+                        "type": "integer",
+                        "default": 10
+                    },
                    "ranking_options": {
                        "type": "object",
                        "properties": {
@ -7217,7 +7302,7 @@
                "additionalProperties": false,
                "required": [
                    "type",
-                    "vector_store_id"
+                    "vector_store_ids"
                ],
                "title": "OpenAIResponseInputToolFileSearch"
            },
@ -7484,6 +7569,64 @@
                ],
                "title": "OpenAIResponseOutputMessageContentOutputText"
            },
+            "OpenAIResponseOutputMessageFileSearchToolCall": {
+                "type": "object",
+                "properties": {
+                    "id": {
+                        "type": "string"
+                    },
+                    "queries": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
+                    },
+                    "status": {
+                        "type": "string"
+                    },
+                    "type": {
+                        "type": "string",
+                        "const": "file_search_call",
+                        "default": "file_search_call"
+                    },
+                    "results": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "additionalProperties": {
+                                "oneOf": [
+                                    {
+                                        "type": "null"
+                                    },
+                                    {
+                                        "type": "boolean"
+                                    },
+                                    {
+                                        "type": "number"
+                                    },
+                                    {
+                                        "type": "string"
+                                    },
+                                    {
+                                        "type": "array"
+                                    },
+                                    {
+                                        "type": "object"
+                                    }
+                                ]
+                            }
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "id",
+                    "queries",
+                    "status",
+                    "type"
+                ],
+                "title": "OpenAIResponseOutputMessageFileSearchToolCall"
+            },
            "OpenAIResponseOutputMessageFunctionToolCall": {
                "type": "object",
                "properties": {
@ -7760,6 +7903,9 @@
                    {
                        "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
                    },
+                    {
+                        "$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall"
+                    },
                    {
                        "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
                    },
@ -7775,6 +7921,7 @@
                    "mapping": {
                        "message": "#/components/schemas/OpenAIResponseMessage",
                        "web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall",
+                        "file_search_call": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall",
                        "function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall",
                        "mcp_call": "#/components/schemas/OpenAIResponseOutputMessageMCPCall",
                        "mcp_list_tools": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools"
@ -11766,6 +11913,232 @@
                ],
                "title": "LogEventRequest"
            },
+            "VectorStoreChunkingStrategy": {
+                "oneOf": [
+                    {
+                        "$ref": "#/components/schemas/VectorStoreChunkingStrategyAuto"
+                    },
+                    {
+                        "$ref": "#/components/schemas/VectorStoreChunkingStrategyStatic"
+                    }
+                ],
+                "discriminator": {
+                    "propertyName": "type",
+                    "mapping": {
+                        "auto": "#/components/schemas/VectorStoreChunkingStrategyAuto",
+                        "static": "#/components/schemas/VectorStoreChunkingStrategyStatic"
+                    }
+                }
+            },
+            "VectorStoreChunkingStrategyAuto": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "auto",
+                        "default": "auto"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type"
+                ],
+                "title": "VectorStoreChunkingStrategyAuto"
+            },
+            "VectorStoreChunkingStrategyStatic": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "static",
+                        "default": "static"
+                    },
+                    "static": {
+                        "$ref": "#/components/schemas/VectorStoreChunkingStrategyStaticConfig"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "static"
+                ],
+                "title": "VectorStoreChunkingStrategyStatic"
+            },
+            "VectorStoreChunkingStrategyStaticConfig": {
+                "type": "object",
+                "properties": {
+                    "chunk_overlap_tokens": {
+                        "type": "integer",
+                        "default": 400
+                    },
+                    "max_chunk_size_tokens": {
+                        "type": "integer",
+                        "default": 800
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "chunk_overlap_tokens",
+                    "max_chunk_size_tokens"
+                ],
+                "title": "VectorStoreChunkingStrategyStaticConfig"
+            },
+            "OpenaiAttachFileToVectorStoreRequest": {
+                "type": "object",
+                "properties": {
+                    "file_id": {
+                        "type": "string",
+                        "description": "The ID of the file to attach to the vector store."
+                    },
+                    "attributes": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        },
+                        "description": "The key-value attributes stored with the file, which can be used for filtering."
+                    },
+                    "chunking_strategy": {
+                        "$ref": "#/components/schemas/VectorStoreChunkingStrategy",
+                        "description": "The chunking strategy to use for the file."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "file_id"
+                ],
+                "title": "OpenaiAttachFileToVectorStoreRequest"
+            },
+            "VectorStoreFileLastError": {
+                "type": "object",
+                "properties": {
+                    "code": {
+                        "oneOf": [
+                            {
+                                "type": "string",
+                                "const": "server_error"
+                            },
+                            {
+                                "type": "string",
+                                "const": "rate_limit_exceeded"
+                            }
+                        ]
+                    },
+                    "message": {
+                        "type": "string"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "code",
+                    "message"
+                ],
+                "title": "VectorStoreFileLastError"
+            },
+            "VectorStoreFileObject": {
+                "type": "object",
+                "properties": {
+                    "id": {
+                        "type": "string"
+                    },
+                    "object": {
+                        "type": "string",
+                        "default": "vector_store.file"
+                    },
+                    "attributes": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    },
+                    "chunking_strategy": {
+                        "$ref": "#/components/schemas/VectorStoreChunkingStrategy"
+                    },
+                    "created_at": {
+                        "type": "integer"
+                    },
+                    "last_error": {
+                        "$ref": "#/components/schemas/VectorStoreFileLastError"
+                    },
+                    "status": {
+                        "oneOf": [
+                            {
+                                "type": "string",
+                                "const": "completed"
+                            },
+                            {
+                                "type": "string",
+                                "const": "in_progress"
+                            },
+                            {
+                                "type": "string",
+                                "const": "cancelled"
+                            },
+                            {
+                                "type": "string",
+                                "const": "failed"
+                            }
+                        ]
+                    },
+                    "usage_bytes": {
+                        "type": "integer",
+                        "default": 0
+                    },
+                    "vector_store_id": {
+                        "type": "string"
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "id",
+                    "object",
+                    "attributes",
+                    "chunking_strategy",
+                    "created_at",
+                    "status",
+                    "usage_bytes",
+                    "vector_store_id"
+                ],
+                "title": "VectorStoreFileObject",
+                "description": "OpenAI Vector Store File object."
+            },
            "OpenAIJSONSchema": {
                "type": "object",
                "properties": {
@ -12404,6 +12777,10 @@
                    },
                    "prompt_logprobs": {
                        "type": "integer"
+                    },
+                    "suffix": {
+                        "type": "string",
+                        "description": "(Optional) The suffix that should be appended to the completion."
                    }
                },
                "additionalProperties": false,
@ -13621,7 +13998,11 @@
                    },
                    "mode": {
                        "type": "string",
-                        "description": "Search mode for retrieval—either \"vector\" or \"keyword\". Default \"vector\"."
+                        "description": "Search mode for retrieval—either \"vector\", \"keyword\", or \"hybrid\". Default \"vector\"."
+                    },
+                    "ranker": {
+                        "$ref": "#/components/schemas/Ranker",
+                        "description": "Configuration for the ranker to use in hybrid search. Defaults to RRF ranker."
                    }
                },
                "additionalProperties": false,
@ -13651,6 +14032,69 @@
                    }
                }
            },
+            "RRFRanker": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "rrf",
+                        "default": "rrf",
+                        "description": "The type of ranker, always \"rrf\""
+                    },
+                    "impact_factor": {
+                        "type": "number",
+                        "default": 60.0,
+                        "description": "The impact factor for RRF scoring. Higher values flatten the scores, reducing the advantage of top-ranked results. Must be greater than 0. Default of 60 is from the original RRF paper (Cormack et al., 2009)."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "impact_factor"
+                ],
+                "title": "RRFRanker",
+                "description": "Reciprocal Rank Fusion (RRF) ranker configuration."
+            },
+            "Ranker": {
+                "oneOf": [
+                    {
+                        "$ref": "#/components/schemas/RRFRanker"
+                    },
+                    {
+                        "$ref": "#/components/schemas/WeightedRanker"
+                    }
+                ],
+                "discriminator": {
+                    "propertyName": "type",
+                    "mapping": {
+                        "rrf": "#/components/schemas/RRFRanker",
+                        "weighted": "#/components/schemas/WeightedRanker"
+                    }
+                }
+            },
+            "WeightedRanker": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "weighted",
+                        "default": "weighted",
+                        "description": "The type of ranker, always \"weighted\""
+                    },
+                    "alpha": {
+                        "type": "number",
+                        "default": 0.5,
+                        "description": "Weight factor between 0 and 1. 0 uses only keyword scores, 1 uses only vector scores, and values in between blend both."
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "alpha"
+                ],
+                "title": "WeightedRanker",
+                "description": "Weighted ranker configuration that combines vector and keyword scores."
+            },
            "QueryRequest": {
                "type": "object",
                "properties": {
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -2263,6 +2263,43 @@ paths:
            schema:
              $ref: '#/components/schemas/LogEventRequest'
        required: true
+  /v1/openai/v1/vector_stores/{vector_store_id}/files:
+    post:
+      responses:
+        '200':
+          description: >-
+            A VectorStoreFileObject representing the attached file.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/VectorStoreFileObject'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - VectorIO
+      description: Attach a file to a vector store.
+      parameters:
+        - name: vector_store_id
+          in: path
+          description: >-
+            The ID of the vector store to attach the file to.
+          required: true
+          schema:
+            type: string
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/OpenaiAttachFileToVectorStoreRequest'
+        required: true
  /v1/openai/v1/completions:
    post:
      responses:
@ -5021,6 +5058,7 @@ components:
    OpenAIResponseInput:
      oneOf:
        - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
+        - $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
        - $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
        - $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
        - $ref: '#/components/schemas/OpenAIResponseMessage'
@ -5115,10 +5153,23 @@ components:
          type: string
          const: file_search
          default: file_search
-        vector_store_id:
+        vector_store_ids:
          type: array
          items:
            type: string
+        filters:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+        max_num_results:
+          type: integer
+          default: 10
        ranking_options:
          type: object
          properties:
@ -5132,7 +5183,7 @@ components:
      additionalProperties: false
      required:
        - type
-        - vector_store_id
+        - vector_store_ids
      title: OpenAIResponseInputToolFileSearch
    OpenAIResponseInputToolFunction:
      type: object
@ -5294,6 +5345,41 @@ components:
        - type
      title: >-
        OpenAIResponseOutputMessageContentOutputText
+    "OpenAIResponseOutputMessageFileSearchToolCall":
+      type: object
+      properties:
+        id:
+          type: string
+        queries:
+          type: array
+          items:
+            type: string
+        status:
+          type: string
+        type:
+          type: string
+          const: file_search_call
+          default: file_search_call
+        results:
+          type: array
+          items:
+            type: object
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
+      additionalProperties: false
+      required:
+        - id
+        - queries
+        - status
+        - type
+      title: >-
+        OpenAIResponseOutputMessageFileSearchToolCall
    "OpenAIResponseOutputMessageFunctionToolCall":
      type: object
      properties:
@ -5491,6 +5577,7 @@ components:
      oneOf:
        - $ref: '#/components/schemas/OpenAIResponseMessage'
        - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
+        - $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
        - $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
        - $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
        - $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
@ -5499,6 +5586,7 @@ components:
        mapping:
          message: '#/components/schemas/OpenAIResponseMessage'
          web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
+          file_search_call: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
          function_call: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
          mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
          mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
@ -8251,6 +8339,148 @@ components:
        - event
        - ttl_seconds
      title: LogEventRequest
+    VectorStoreChunkingStrategy:
+      oneOf:
+        - $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
+        - $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
+      discriminator:
+        propertyName: type
+        mapping:
+          auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
+          static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
+    VectorStoreChunkingStrategyAuto:
+      type: object
+      properties:
+        type:
+          type: string
+          const: auto
+          default: auto
+      additionalProperties: false
+      required:
+        - type
+      title: VectorStoreChunkingStrategyAuto
+    VectorStoreChunkingStrategyStatic:
+      type: object
+      properties:
+        type:
+          type: string
+          const: static
+          default: static
+        static:
+          $ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
+      additionalProperties: false
+      required:
+        - type
+        - static
+      title: VectorStoreChunkingStrategyStatic
+    VectorStoreChunkingStrategyStaticConfig:
+      type: object
+      properties:
+        chunk_overlap_tokens:
+          type: integer
+          default: 400
+        max_chunk_size_tokens:
+          type: integer
+          default: 800
+      additionalProperties: false
+      required:
+        - chunk_overlap_tokens
+        - max_chunk_size_tokens
+      title: VectorStoreChunkingStrategyStaticConfig
+    OpenaiAttachFileToVectorStoreRequest:
+      type: object
+      properties:
+        file_id:
+          type: string
+          description: >-
+            The ID of the file to attach to the vector store.
+        attributes:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+          description: >-
+            The key-value attributes stored with the file, which can be used for filtering.
+        chunking_strategy:
+          $ref: '#/components/schemas/VectorStoreChunkingStrategy'
+          description: >-
+            The chunking strategy to use for the file.
+      additionalProperties: false
+      required:
+        - file_id
+      title: OpenaiAttachFileToVectorStoreRequest
+    VectorStoreFileLastError:
+      type: object
+      properties:
+        code:
+          oneOf:
+            - type: string
+              const: server_error
+            - type: string
+              const: rate_limit_exceeded
+        message:
+          type: string
+      additionalProperties: false
+      required:
+        - code
+        - message
+      title: VectorStoreFileLastError
+    VectorStoreFileObject:
+      type: object
+      properties:
+        id:
+          type: string
+        object:
+          type: string
+          default: vector_store.file
+        attributes:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+        chunking_strategy:
+          $ref: '#/components/schemas/VectorStoreChunkingStrategy'
+        created_at:
+          type: integer
+        last_error:
+          $ref: '#/components/schemas/VectorStoreFileLastError'
+        status:
+          oneOf:
+            - type: string
+              const: completed
+            - type: string
+              const: in_progress
+            - type: string
+              const: cancelled
+            - type: string
+              const: failed
+        usage_bytes:
+          type: integer
+          default: 0
+        vector_store_id:
+          type: string
+      additionalProperties: false
+      required:
+        - id
+        - object
+        - attributes
+        - chunking_strategy
+        - created_at
+        - status
+        - usage_bytes
+        - vector_store_id
+      title: VectorStoreFileObject
+      description: OpenAI Vector Store File object.
    OpenAIJSONSchema:
      type: object
      properties:
@ -8673,6 +8903,10 @@ components:
            type: string
        prompt_logprobs:
          type: integer
+        suffix:
+          type: string
+          description: >-
+            (Optional) The suffix that should be appended to the completion.
      additionalProperties: false
      required:
        - model
@ -9526,7 +9760,13 @@ components:
        mode:
          type: string
          description: >-
-            Search mode for retrieval—either "vector" or "keyword". Default "vector".
+            Search mode for retrieval—either "vector", "keyword", or "hybrid". Default
+            "vector".
+        ranker:
+          $ref: '#/components/schemas/Ranker'
+          description: >-
+            Configuration for the ranker to use in hybrid search. Defaults to RRF
+            ranker.
      additionalProperties: false
      required:
        - query_generator_config
@ -9545,6 +9785,58 @@ components:
        mapping:
          default: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
          llm: '#/components/schemas/LLMRAGQueryGeneratorConfig'
+    RRFRanker:
+      type: object
+      properties:
+        type:
+          type: string
+          const: rrf
+          default: rrf
+          description: The type of ranker, always "rrf"
+        impact_factor:
+          type: number
+          default: 60.0
+          description: >-
+            The impact factor for RRF scoring (the constant k in 1 / (k + rank)). Higher
+            values flatten the difference between ranks, so top-ranked results dominate
+            less. Must be greater than 0. Default of 60 is from the original RRF
+            paper (Cormack et al., 2009).
+      additionalProperties: false
+      required:
+        - type
+        - impact_factor
+      title: RRFRanker
+      description: >-
+        Reciprocal Rank Fusion (RRF) ranker configuration.
+    Ranker:
+      oneOf:
+        - $ref: '#/components/schemas/RRFRanker'
+        - $ref: '#/components/schemas/WeightedRanker'
+      discriminator:
+        propertyName: type
+        mapping:
+          rrf: '#/components/schemas/RRFRanker'
+          weighted: '#/components/schemas/WeightedRanker'
+    WeightedRanker:
+      type: object
+      properties:
+        type:
+          type: string
+          const: weighted
+          default: weighted
+          description: The type of ranker, always "weighted"
+        alpha:
+          type: number
+          default: 0.5
+          description: >-
+            Weight factor between 0 and 1. 0 means only use keyword scores, 1 means
+            only use vector scores, values in between blend both scores.
+      additionalProperties: false
+      required:
+        - type
+        - alpha
+      title: WeightedRanker
+      description: >-
+        Weighted ranker configuration that combines vector and keyword scores.
    QueryRequest:
      type: object
      properties:
--- a/docs/source/distributions/self_hosted_distro/ollama.md
+++ b/docs/source/distributions/self_hosted_distro/ollama.md
@ -18,6 +18,7 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
+| files | `inline::localfs` |
 | inference | `remote::ollama` |
 | post_training | `inline::huggingface` |
 | safety | `inline::llama-guard` |
--- a/docs/source/providers/vector_io/sqlite-vec.md
+++ b/docs/source/providers/vector_io/sqlite-vec.md
@ -66,25 +66,126 @@ To use sqlite-vec in your Llama Stack project, follow these steps:
 2. Configure your Llama Stack project to use SQLite-Vec.
 3. Start storing and querying vectors.

-## Supported Search Modes
+The SQLite-vec provider supports three search modes:

-The sqlite-vec provider supports both vector-based and keyword-based (full-text) search modes.
-
-When using the RAGTool interface, you can specify the desired search behavior via the `mode` parameter in
-`RAGQueryConfig`. For example:
+1. **Vector Search** (`mode="vector"`): Performs pure vector similarity search using the embeddings.
+2. **Keyword Search** (`mode="keyword"`): Performs full-text search using SQLite's FTS5.
+3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword search for better results. Runs the vector search and the keyword search independently, then merges their scores with a ranker (RRF by default, or weighted).

+Example with hybrid search:
 ```python
-from llama_stack.apis.tool_runtime.rag import RAGQueryConfig
+response = await vector_io.query_chunks(
+    vector_db_id="my_db",
+    query="your query here",
+    params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
+)

-query_config = RAGQueryConfig(max_chunks=6, mode="vector")
+# Using RRF ranker
+response = await vector_io.query_chunks(
+    vector_db_id="my_db",
+    query="your query here",
+    params={
+        "mode": "hybrid",
+        "max_chunks": 3,
+        "score_threshold": 0.7,
+        "ranker": {"type": "rrf", "impact_factor": 60.0},
+    },
+)

-results = client.tool_runtime.rag_tool.query(
-    vector_db_ids=[vector_db_id],
-    content="what is torchtune",
-    query_config=query_config,
+# Using weighted ranker
+response = await vector_io.query_chunks(
+    vector_db_id="my_db",
+    query="your query here",
+    params={
+        "mode": "hybrid",
+        "max_chunks": 3,
+        "score_threshold": 0.7,
+        "ranker": {"type": "weighted", "alpha": 0.7},  # 70% vector, 30% keyword
+    },
 )
 ```

+Example with explicit vector search:
+```python
+response = await vector_io.query_chunks(
+    vector_db_id="my_db",
+    query="your query here",
+    params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
+)
+```
+
+Example with keyword search:
+```python
+response = await vector_io.query_chunks(
+    vector_db_id="my_db",
+    query="your query here",
+    params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
+)
+```
+
+## Supported Search Modes
+
+The SQLite vector store supports three search modes:
+
+1. **Vector Search** (`mode="vector"`): Uses vector similarity to find relevant chunks
+2. **Keyword Search** (`mode="keyword"`): Uses keyword matching to find relevant chunks
+3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword scores using a ranker
+
+### Hybrid Search
+
+Hybrid search combines the strengths of both vector and keyword search by:
+- Computing vector similarity scores
+- Computing keyword match scores
+- Using a ranker to combine these scores
+
+Two ranker types are supported:
+
+1. **RRF (Reciprocal Rank Fusion)**:
+   - Combines ranks from both vector and keyword results
+   - Uses an impact factor (default: 60.0) to control the weight of higher-ranked results; larger values reduce the advantage of top ranks
+   - Good for balancing between vector and keyword results
+   - The default impact factor of 60.0 comes from the original RRF paper by Cormack et al. (2009) [^1], which found this value to provide optimal performance across various retrieval tasks
+
+2. **Weighted**:
+   - Linearly combines normalized vector and keyword scores
+   - Uses an alpha parameter (0-1) to control the blend:
+     - alpha=0: Only use keyword scores
+     - alpha=1: Only use vector scores
+     - alpha=0.5: Equal weight to both (default)
+
+Example using RAGQueryConfig with different search modes:
+
+```python
+from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker
+
+# Vector search
+config = RAGQueryConfig(mode="vector", max_chunks=5)
+
+# Keyword search
+config = RAGQueryConfig(mode="keyword", max_chunks=5)
+
+# Hybrid search with custom RRF ranker
+config = RAGQueryConfig(
+    mode="hybrid",
+    max_chunks=5,
+    ranker=RRFRanker(impact_factor=50.0),  # Custom impact factor
+)
+
+# Hybrid search with weighted ranker
+config = RAGQueryConfig(
+    mode="hybrid",
+    max_chunks=5,
+    ranker=WeightedRanker(alpha=0.7),  # 70% vector, 30% keyword
+)
+
+# Hybrid search with default RRF ranker
+config = RAGQueryConfig(
+    mode="hybrid", max_chunks=5
+)  # Will use RRF with impact_factor=60.0
+```
+
+Note: The ranker configuration is only used in hybrid mode. For vector or keyword modes, the ranker parameter is ignored.
+
 ## Installation

 You can install SQLite-Vec using pip:
@ -96,3 +197,5 @@ pip install sqlite-vec
 ## Documentation

 See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) for more details about sqlite-vec in general.
+
+[^1]: Cormack, G. V., Clarke, C. L., & Buettcher, S. (2009). [Reciprocal rank fusion outperforms condorcet and individual rank learning methods](https://dl.acm.org/doi/10.1145/1571941.1572114). In Proceedings of the 32nd international ACM SIGIR conference on Research and development in information retrieval (pp. 758-759).
--- a/llama_stack/apis/agents/openai_responses.py
+++ b/llama_stack/apis/agents/openai_responses.py
@ -81,6 +81,15 @@ class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
    type: Literal["web_search_call"] = "web_search_call"


+@json_schema_type
+class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
+    # Output item that records one file_search tool invocation; `type` is the
+    # literal tag that distinguishes it from the other output item models.
+    id: str  # identifier of the tool call this item reports on
+    queries: list[str]  # search queries issued for the call
+    status: str  # plain string; no set of allowed values is enforced here
+    type: Literal["file_search_call"] = "file_search_call"
+    # Optional per-hit payloads as untyped dicts; stays None when no results were recorded.
+    results: list[dict[str, Any]] | None = None
+
+
@json_schema_type
 class OpenAIResponseOutputMessageFunctionToolCall(BaseModel):
    call_id: str
@ -119,6 +128,7 @@ class OpenAIResponseOutputMessageMCPListTools(BaseModel):
 OpenAIResponseOutput = Annotated[
    OpenAIResponseMessage
    | OpenAIResponseOutputMessageWebSearchToolCall
+    | OpenAIResponseOutputMessageFileSearchToolCall
    | OpenAIResponseOutputMessageFunctionToolCall
    | OpenAIResponseOutputMessageMCPCall
    | OpenAIResponseOutputMessageMCPListTools,
@ -362,6 +372,7 @@ class OpenAIResponseInputFunctionToolCallOutput(BaseModel):
 OpenAIResponseInput = Annotated[
    # Responses API allows output messages to be passed in as input
    OpenAIResponseOutputMessageWebSearchToolCall
+    | OpenAIResponseOutputMessageFileSearchToolCall
    | OpenAIResponseOutputMessageFunctionToolCall
    | OpenAIResponseInputFunctionToolCallOutput
    |
@ -397,9 +408,10 @@ class FileSearchRankingOptions(BaseModel):
@json_schema_type
 class OpenAIResponseInputToolFileSearch(BaseModel):
    type: Literal["file_search"] = "file_search"
-    vector_store_id: list[str]
+    vector_store_ids: list[str]
+    filters: dict[str, Any] | None = None
+    max_num_results: int | None = Field(default=10, ge=1, le=50)
    ranking_options: FileSearchRankingOptions | None = None
-    # TODO: add filters


 class ApprovalFilter(BaseModel):
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@ -1038,6 +1038,8 @@ class InferenceProvider(Protocol):
        # vLLM-specific parameters
        guided_choice: list[str] | None = None,
        prompt_logprobs: int | None = None,
+        # for fill-in-the-middle type completion
+        suffix: str | None = None,
    ) -> OpenAICompletion:
        """Generate an OpenAI-compatible completion for the given prompt using the specified model.

@ -1058,6 +1060,7 @@ class InferenceProvider(Protocol):
        :param temperature: (Optional) The temperature to use.
        :param top_p: (Optional) The top p to use.
        :param user: (Optional) The user to use.
+        :param suffix: (Optional) The suffix that should be appended to the completion.
        :returns: An OpenAICompletion.
        """
        ...
--- a/llama_stack/apis/tools/rag_tool.py
+++ b/llama_stack/apis/tools/rag_tool.py
@ -15,6 +15,48 @@ from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod


+@json_schema_type
+class RRFRanker(BaseModel):
+    """
+    Reciprocal Rank Fusion (RRF) ranker configuration.
+
+    :param type: The type of ranker, always "rrf"
+    :param impact_factor: The impact factor for RRF scoring (the constant k in 1 / (k + rank)).
+                         Higher values flatten the difference between ranks, so top-ranked results dominate less.
+                         Must be greater than 0. Default of 60 is from the original RRF paper (Cormack et al., 2009).
+    """
+
+    type: Literal["rrf"] = "rrf"
+    impact_factor: float = Field(default=60.0, gt=0.0)  # default of 60 for optimal performance
+
+
+@json_schema_type
+class WeightedRanker(BaseModel):
+    """
+    Weighted ranker configuration that combines vector and keyword scores.
+
+    :param type: The type of ranker, always "weighted"
+    :param alpha: Weight factor between 0 and 1.
+                 0 means only use keyword scores,
+                 1 means only use vector scores,
+                 values in between blend both scores.
+    """
+
+    type: Literal["weighted"] = "weighted"
+    alpha: float = Field(
+        default=0.5,
+        ge=0.0,
+        le=1.0,
+        description="Weight factor between 0 and 1. 0 means only keyword scores, 1 means only vector scores.",
+    )
+
+
+Ranker = Annotated[
+    RRFRanker | WeightedRanker,
+    Field(discriminator="type"),
+]
+register_schema(Ranker, name="Ranker")
+
+
@json_schema_type
 class RAGDocument(BaseModel):
    """
@ -76,7 +118,8 @@ class RAGQueryConfig(BaseModel):
    :param chunk_template: Template for formatting each retrieved chunk in the context.
        Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict).
        Default: "Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n"
-    :param mode: Search mode for retrieval—either "vector" or "keyword". Default "vector".
+    :param mode: Search mode for retrieval—either "vector", "keyword", or "hybrid". Default "vector".
+    :param ranker: Configuration for the ranker to use in hybrid search. Defaults to RRF ranker.
    """

    # This config defines how a query is generated using the messages
@ -86,6 +129,7 @@ class RAGQueryConfig(BaseModel):
    max_chunks: int = 5
    chunk_template: str = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n"
    mode: str | None = None
+    ranker: Ranker | None = Field(default=None)  # Only used for hybrid mode

    @field_validator("chunk_template")
    def validate_chunk_template(cls, v: str) -> str:
--- a/llama_stack/apis/vector_io/vector_io.py
+++ b/llama_stack/apis/vector_io/vector_io.py
@ -8,7 +8,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Literal, Protocol, runtime_checkable
+from typing import Annotated, Any, Literal, Protocol, runtime_checkable

 from pydantic import BaseModel, Field

@ -16,6 +16,7 @@ from llama_stack.apis.inference import InterleavedContent
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod
+from llama_stack.strong_typing.schema import register_schema


 class Chunk(BaseModel):
@ -133,6 +134,50 @@ class VectorStoreDeleteResponse(BaseModel):
    deleted: bool = True


+@json_schema_type
+class VectorStoreChunkingStrategyAuto(BaseModel):
+    type: Literal["auto"] = "auto"
+
+
+@json_schema_type
+class VectorStoreChunkingStrategyStaticConfig(BaseModel):
+    # Settings carried by the "static" chunking strategy.
+    # NOTE(review): nothing here relates chunk_overlap_tokens to max_chunk_size_tokens
+    # (or requires it to be non-negative) — confirm the consumer validates the combination.
+    chunk_overlap_tokens: int = 400
+    max_chunk_size_tokens: int = Field(800, ge=100, le=4096)  # default 800, validated to lie in [100, 4096]
+
+
+@json_schema_type
+class VectorStoreChunkingStrategyStatic(BaseModel):
+    type: Literal["static"] = "static"
+    static: VectorStoreChunkingStrategyStaticConfig
+
+
+VectorStoreChunkingStrategy = Annotated[
+    VectorStoreChunkingStrategyAuto | VectorStoreChunkingStrategyStatic, Field(discriminator="type")
+]
+register_schema(VectorStoreChunkingStrategy, name="VectorStoreChunkingStrategy")
+
+
+@json_schema_type
+class VectorStoreFileLastError(BaseModel):
+    code: Literal["server_error"] | Literal["rate_limit_exceeded"]
+    message: str
+
+
+@json_schema_type
+class VectorStoreFileObject(BaseModel):
+    """OpenAI Vector Store File object."""
+
+    id: str
+    object: str = "vector_store.file"
+    attributes: dict[str, Any] = Field(default_factory=dict)
+    chunking_strategy: VectorStoreChunkingStrategy
+    created_at: int
+    last_error: VectorStoreFileLastError | None = None
+    status: Literal["completed"] | Literal["in_progress"] | Literal["cancelled"] | Literal["failed"]
+    usage_bytes: int = 0
+    vector_store_id: str
+
+
 class VectorDBStore(Protocol):
    def get_vector_db(self, vector_db_id: str) -> VectorDB | None: ...

@ -290,3 +335,21 @@ class VectorIO(Protocol):
        :returns: A VectorStoreSearchResponse containing the search results.
        """
        ...
+
+    @webmethod(route="/openai/v1/vector_stores/{vector_store_id}/files", method="POST")
+    async def openai_attach_file_to_vector_store(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        attributes: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> VectorStoreFileObject:
+        """Attach a file to a vector store.
+
+        :param vector_store_id: The ID of the vector store to attach the file to.
+        :param file_id: The ID of the file to attach to the vector store.
+        :param attributes: The key-value attributes stored with the file, which can be used for filtering.
+        :param chunking_strategy: The chunking strategy to use for the file.
+        :returns: A VectorStoreFileObject representing the attached file.
+        """
+        ...
--- a/llama_stack/distribution/routers/inference.py
+++ b/llama_stack/distribution/routers/inference.py
@ -426,6 +426,7 @@ class InferenceRouter(Inference):
        user: str | None = None,
        guided_choice: list[str] | None = None,
        prompt_logprobs: int | None = None,
+        suffix: str | None = None,
    ) -> OpenAICompletion:
        logger.debug(
            f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}",
@ -456,6 +457,7 @@ class InferenceRouter(Inference):
            user=user,
            guided_choice=guided_choice,
            prompt_logprobs=prompt_logprobs,
+            suffix=suffix,
        )

        provider = self.routing_table.get_provider_impl(model_obj.identifier)
--- a/llama_stack/distribution/routers/vector_io.py
+++ b/llama_stack/distribution/routers/vector_io.py
@ -19,6 +19,7 @@ from llama_stack.apis.vector_io import (
    VectorStoreObject,
    VectorStoreSearchResponsePage,
 )
+from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import RoutingTable

@ -254,3 +255,20 @@ class VectorIORouter(VectorIO):
            ranking_options=ranking_options,
            rewrite_query=rewrite_query,
        )
+
+    async def openai_attach_file_to_vector_store(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        attributes: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> VectorStoreFileObject:
+        """Attach a file to a vector store by delegating to the provider that owns the store.
+
+        :param vector_store_id: ID of the target vector store; also the key used to look up the provider.
+        :param file_id: ID of the file to attach.
+        :param attributes: Optional attributes, forwarded unchanged to the provider.
+        :param chunking_strategy: Optional chunking strategy, forwarded unchanged to the provider.
+        :returns: The VectorStoreFileObject returned by the provider.
+        """
+        logger.debug(f"VectorIORouter.openai_attach_file_to_vector_store: {vector_store_id}, {file_id}")
+        # Route based on vector store ID
+        provider = self.routing_table.get_provider_impl(vector_store_id)
+        return await provider.openai_attach_file_to_vector_store(
+            vector_store_id=vector_store_id,
+            file_id=file_id,
+            attributes=attributes,
+            chunking_strategy=chunking_strategy,
+        )
--- a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
+++ b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
@ -24,6 +24,7 @@ from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseInputMessageContentImage,
    OpenAIResponseInputMessageContentText,
    OpenAIResponseInputTool,
+    OpenAIResponseInputToolFileSearch,
    OpenAIResponseInputToolMCP,
    OpenAIResponseMessage,
    OpenAIResponseObject,
@ -34,6 +35,7 @@ from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseOutput,
    OpenAIResponseOutputMessageContent,
    OpenAIResponseOutputMessageContentOutputText,
+    OpenAIResponseOutputMessageFileSearchToolCall,
    OpenAIResponseOutputMessageFunctionToolCall,
    OpenAIResponseOutputMessageMCPListTools,
    OpenAIResponseOutputMessageWebSearchToolCall,
@ -62,7 +64,7 @@ from llama_stack.apis.inference.inference import (
    OpenAIToolMessageParam,
    OpenAIUserMessageParam,
 )
-from llama_stack.apis.tools.tools import ToolGroups, ToolRuntime
+from llama_stack.apis.tools import RAGQueryConfig, ToolGroups, ToolRuntime
 from llama_stack.log import get_logger
 from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
 from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
@ -198,7 +200,8 @@ class OpenAIResponsePreviousResponseWithInputItems(BaseModel):
 class ChatCompletionContext(BaseModel):
    model: str
    messages: list[OpenAIMessageParam]
-    tools: list[ChatCompletionToolParam] | None = None
+    response_tools: list[OpenAIResponseInputTool] | None = None
+    chat_tools: list[ChatCompletionToolParam] | None = None
    mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP]
    temperature: float | None
    response_format: OpenAIResponseFormatParam
@ -388,7 +391,8 @@ class OpenAIResponsesImpl:
        ctx = ChatCompletionContext(
            model=model,
            messages=messages,
-            tools=chat_tools,
+            response_tools=tools,
+            chat_tools=chat_tools,
            mcp_tool_to_server=mcp_tool_to_server,
            temperature=temperature,
            response_format=response_format,
@ -417,7 +421,7 @@ class OpenAIResponsesImpl:
            completion_result = await self.inference_api.openai_chat_completion(
                model=ctx.model,
                messages=messages,
-                tools=ctx.tools,
+                tools=ctx.chat_tools,
                stream=True,
                temperature=ctx.temperature,
                response_format=ctx.response_format,
@ -606,6 +610,12 @@ class OpenAIResponsesImpl:
                if not tool:
                    raise ValueError(f"Tool {tool_name} not found")
                chat_tools.append(make_openai_tool(tool_name, tool))
+            elif input_tool.type == "file_search":
+                tool_name = "knowledge_search"
+                tool = await self.tool_groups_api.get_tool(tool_name)
+                if not tool:
+                    raise ValueError(f"Tool {tool_name} not found")
+                chat_tools.append(make_openai_tool(tool_name, tool))
            elif input_tool.type == "mcp":
                always_allowed = None
                never_allowed = None
@ -667,6 +677,7 @@ class OpenAIResponsesImpl:

        tool_call_id = tool_call.id
        function = tool_call.function
+        tool_kwargs = json.loads(function.arguments) if function.arguments else {}

        if not function or not tool_call_id or not function.name:
            return None, None
@ -680,12 +691,26 @@ class OpenAIResponsesImpl:
                    endpoint=mcp_tool.server_url,
                    headers=mcp_tool.headers or {},
                    tool_name=function.name,
-                    kwargs=json.loads(function.arguments) if function.arguments else {},
+                    kwargs=tool_kwargs,
                )
            else:
+                if function.name == "knowledge_search":
+                    response_file_search_tool = next(
+                        t for t in ctx.response_tools if isinstance(t, OpenAIResponseInputToolFileSearch)
+                    )
+                    if response_file_search_tool:
+                        if response_file_search_tool.filters:
+                            logger.warning("Filters are not yet supported for file_search tool")
+                        if response_file_search_tool.ranking_options:
+                            logger.warning("Ranking options are not yet supported for file_search tool")
+                        tool_kwargs["vector_db_ids"] = response_file_search_tool.vector_store_ids
+                        tool_kwargs["query_config"] = RAGQueryConfig(
+                            mode="vector",
+                            max_chunks=response_file_search_tool.max_num_results,
+                        )
                result = await self.tool_runtime_api.invoke_tool(
                    tool_name=function.name,
-                    kwargs=json.loads(function.arguments) if function.arguments else {},
+                    kwargs=tool_kwargs,
                )
        except Exception as e:
            error_exc = e
@ -713,6 +738,27 @@ class OpenAIResponsesImpl:
                )
                if error_exc or (result.error_code and result.error_code > 0) or result.error_message:
                    message.status = "failed"
+            elif function.name == "knowledge_search":
+                message = OpenAIResponseOutputMessageFileSearchToolCall(
+                    id=tool_call_id,
+                    queries=[tool_kwargs.get("query", "")],
+                    status="completed",
+                )
+                if "document_ids" in result.metadata:
+                    message.results = []
+                    for i, doc_id in enumerate(result.metadata["document_ids"]):
+                        text = result.metadata["chunks"][i] if "chunks" in result.metadata else None
+                        score = result.metadata["scores"][i] if "scores" in result.metadata else None
+                        message.results.append(
+                            {
+                                "file_id": doc_id,
+                                "filename": doc_id,
+                                "text": text,
+                                "score": score,
+                            }
+                        )
+                if error_exc or (result.error_code and result.error_code > 0) or result.error_message:
+                    message.status = "failed"
            else:
                raise ValueError(f"Unknown tool {function.name} called")

--- a/llama_stack/providers/inline/tool_runtime/rag/memory.py
+++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py
@ -121,8 +121,10 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
                vector_db_id=vector_db_id,
                query=query,
                params={
-                    "max_chunks": query_config.max_chunks,
                    "mode": query_config.mode,
+                    "max_chunks": query_config.max_chunks,
+                    "score_threshold": 0.0,
+                    "ranker": query_config.ranker,
                },
            )
            for vector_db_id in vector_db_ids
@ -170,6 +172,8 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
            content=picked,
            metadata={
                "document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]],
+                "chunks": [c.content for c in chunks[: len(picked)]],
+                "scores": scores[: len(picked)],
            },
        )

--- a/llama_stack/providers/inline/vector_io/faiss/init.py
+++ b/llama_stack/providers/inline/vector_io/faiss/init.py
@ -16,6 +16,6 @@ async def get_provider_impl(config: FaissVectorIOConfig, deps: dict[Api, Any]):

    assert isinstance(config, FaissVectorIOConfig), f"Unexpected config type: {type(config)}"

-    impl = FaissVectorIOAdapter(config, deps[Api.inference])
+    impl = FaissVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files, None))
    await impl.initialize()
    return impl
--- a/llama_stack/providers/inline/vector_io/faiss/faiss.py
+++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py
@ -15,6 +15,7 @@ import faiss
 import numpy as np
 from numpy.typing import NDArray

+from llama_stack.apis.files import Files
 from llama_stack.apis.inference import InterleavedContent
 from llama_stack.apis.inference.inference import Inference
 from llama_stack.apis.vector_dbs import VectorDB
@ -130,11 +131,23 @@ class FaissIndex(EmbeddingIndex):
    ) -> QueryChunksResponse:
        raise NotImplementedError("Keyword search is not supported in FAISS")

+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        """Hybrid search entry point; the FAISS index does not implement it.
+
+        None of the arguments are used: the call fails unconditionally.
+
+        :raises NotImplementedError: always.
+        """
+        raise NotImplementedError("Hybrid search is not supported in FAISS")
+

 class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
-    def __init__(self, config: FaissVectorIOConfig, inference_api: Inference) -> None:
+    def __init__(self, config: FaissVectorIOConfig, inference_api: Inference, files_api: Files | None) -> None:
        self.config = config
        self.inference_api = inference_api
+        self.files_api = files_api
        self.cache: dict[str, VectorDBWithIndex] = {}
        self.kvstore: KVStore | None = None
        self.openai_vector_stores: dict[str, dict[str, Any]] = {}
--- a/llama_stack/providers/inline/vector_io/sqlite_vec/init.py
+++ b/llama_stack/providers/inline/vector_io/sqlite_vec/init.py
@ -15,6 +15,6 @@ async def get_provider_impl(config: SQLiteVectorIOConfig, deps: dict[Api, Any]):
    from .sqlite_vec import SQLiteVecVectorIOAdapter

    assert isinstance(config, SQLiteVectorIOConfig), f"Unexpected config type: {type(config)}"
-    impl = SQLiteVecVectorIOAdapter(config, deps[Api.inference])
+    impl = SQLiteVecVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files, None))
    await impl.initialize()
    return impl
--- a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
+++ b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
@ -17,6 +17,7 @@ import numpy as np
 import sqlite_vec
 from numpy.typing import NDArray

+from llama_stack.apis.files.files import Files
 from llama_stack.apis.inference.inference import Inference
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.apis.vector_io import (
@ -26,14 +27,20 @@ from llama_stack.apis.vector_io import (
 )
 from llama_stack.providers.datatypes import VectorDBsProtocolPrivate
 from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
-from llama_stack.providers.utils.memory.vector_store import EmbeddingIndex, VectorDBWithIndex
+from llama_stack.providers.utils.memory.vector_store import (
+    RERANKER_TYPE_RRF,
+    RERANKER_TYPE_WEIGHTED,
+    EmbeddingIndex,
+    VectorDBWithIndex,
+)

 logger = logging.getLogger(__name__)

 # Specifying search mode is dependent on the VectorIO provider.
 VECTOR_SEARCH = "vector"
 KEYWORD_SEARCH = "keyword"
-SEARCH_MODES = {VECTOR_SEARCH, KEYWORD_SEARCH}
+HYBRID_SEARCH = "hybrid"
+SEARCH_MODES = {VECTOR_SEARCH, KEYWORD_SEARCH, HYBRID_SEARCH}


 def serialize_vector(vector: list[float]) -> bytes:
@ -50,6 +57,59 @@ def _create_sqlite_connection(db_path):
    return connection


+def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
+    """Rescale scores to the [0, 1] range using min-max normalization.
+
+    Args:
+        scores: Mapping of ID to raw score.
+
+    Returns:
+        Mapping with the same keys, where the smallest input maps to 0.0 and the
+        largest to 1.0. An empty input yields an empty dict; when every score is
+        equal, each one maps to 1.0.
+    """
+    if not scores:
+        return {}
+    min_score = min(scores.values())
+    max_score = max(scores.values())
+    score_range = max_score - min_score
+    if score_range > 0:
+        return {doc_id: (score - min_score) / score_range for doc_id, score in scores.items()}
+    # All scores are identical: skip the division by zero and give every entry 1.0.
+    return {doc_id: 1.0 for doc_id in scores}
+
+
+def _weighted_rerank(
+    vector_scores: dict[str, float],
+    keyword_scores: dict[str, float],
+    alpha: float = 0.5,
+) -> dict[str, float]:
+    """ReRanker that uses a weighted average of min-max normalized scores.
+
+    Args:
+        vector_scores: Mapping of ID to vector-similarity score.
+        keyword_scores: Mapping of ID to keyword-match score.
+        alpha: Weight given to the vector scores; keyword scores get (1 - alpha).
+            0 means only keyword scores, 1 means only vector scores.
+
+    Returns:
+        Combined score for every ID present in either input. An ID missing from
+        one input contributes 0.0 for that side.
+    """
+    all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
+    normalized_vector_scores = _normalize_scores(vector_scores)
+    normalized_keyword_scores = _normalize_scores(keyword_scores)
+
+    # alpha weights the vector side, matching the documented WeightedRanker.alpha
+    # contract (alpha=1 -> vector only, alpha=0 -> keyword only).
+    return {
+        doc_id: (alpha * normalized_vector_scores.get(doc_id, 0.0))
+        + ((1 - alpha) * normalized_keyword_scores.get(doc_id, 0.0))
+        for doc_id in all_ids
+    }
+
+
+def _rrf_rerank(
+    vector_scores: dict[str, float],
+    keyword_scores: dict[str, float],
+    impact_factor: float = 60.0,
+) -> dict[str, float]:
+    """Fuse two score maps with Reciprocal Rank Fusion.
+
+    Each input is converted to 1-based ranks (highest score gets rank 1). An ID
+    that is absent from one input is given an infinite rank there, so that side
+    adds 0.0 to its fused score.
+
+    Args:
+        vector_scores: Mapping of ID to vector-similarity score.
+        keyword_scores: Mapping of ID to keyword-match score.
+        impact_factor: The constant k added to every rank before taking the reciprocal.
+
+    Returns:
+        Mapping of every ID found in either input to its fused RRF score.
+    """
+
+    def _to_ranks(scores: dict[str, float]) -> dict[str, int]:
+        # Best score first; positions are counted from 1.
+        ordered = sorted(scores.items(), key=lambda item: item[1], reverse=True)
+        return {doc_id: position for position, (doc_id, _) in enumerate(ordered, start=1)}
+
+    vector_ranks = _to_ranks(vector_scores)
+    keyword_ranks = _to_ranks(keyword_scores)
+    unranked = float("inf")
+
+    # RRF formula: score = 1/(k + r) where k is impact_factor and r is the rank
+    return {
+        doc_id: (1.0 / (impact_factor + vector_ranks.get(doc_id, unranked)))
+        + (1.0 / (impact_factor + keyword_ranks.get(doc_id, unranked)))
+        for doc_id in set(vector_scores.keys()) | set(keyword_scores.keys())
+    }
+
+
 class SQLiteVecIndex(EmbeddingIndex):
    """
    An index implementation that stores embeddings in a SQLite virtual table using sqlite-vec.
@ -254,8 +314,6 @@ class SQLiteVecIndex(EmbeddingIndex):
        """
        Performs keyword-based search using SQLite FTS5 for relevance-ranked full-text search.
        """
-        if query_string is None:
-            raise ValueError("query_string is required for keyword search.")

        def _execute_query():
            connection = _create_sqlite_connection(self.db_path)
@ -293,6 +351,81 @@ class SQLiteVecIndex(EmbeddingIndex):
            scores.append(score)
        return QueryChunksResponse(chunks=chunks, scores=scores)

+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str = RERANKER_TYPE_RRF,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        """
+        Run vector and keyword search, then fuse the two result lists with a reranker.
+
+        Args:
+            embedding: Query embedding used for the vector search
+            query_string: Query text used for the keyword search
+            k: Number of results requested from each search and the cap on fused results
+            score_threshold: Passed to both searches and also applied to the fused score
+            reranker_type: RERANKER_TYPE_WEIGHTED selects the weighted reranker; any other
+                value falls back to Reciprocal Rank Fusion
+            reranker_params: Optional reranker settings ("alpha" for weighted,
+                "impact_factor" for RRF)
+
+        Returns:
+            QueryChunksResponse with the fused chunks in descending score order
+        """
+        options = reranker_params if reranker_params is not None else {}
+
+        vector_response = await self.query_vector(embedding, k, score_threshold)
+        keyword_response = await self.query_keyword(query_string, k, score_threshold)
+
+        # Compute each chunk ID once; the IDs key both the score tables and the chunk lookup.
+        vector_ids = [
+            generate_chunk_id(chunk.metadata["document_id"], str(chunk.content)) for chunk in vector_response.chunks
+        ]
+        keyword_ids = [
+            generate_chunk_id(chunk.metadata["document_id"], str(chunk.content)) for chunk in keyword_response.chunks
+        ]
+        vector_scores = dict(zip(vector_ids, vector_response.scores, strict=False))
+        keyword_scores = dict(zip(keyword_ids, keyword_response.scores, strict=False))
+
+        if reranker_type == RERANKER_TYPE_WEIGHTED:
+            combined_scores = _weighted_rerank(vector_scores, keyword_scores, options.get("alpha", 0.5))
+        else:
+            # None, RRF, and unknown reranker types all use RRF
+            combined_scores = _rrf_rerank(vector_scores, keyword_scores, options.get("impact_factor", 60.0))
+
+        # Keyword chunks are written last, so they win when both searches return the same ID.
+        chunk_map = dict(zip(vector_ids, vector_response.chunks, strict=False))
+        chunk_map.update(zip(keyword_ids, keyword_response.chunks, strict=False))
+
+        # Take the k best fused scores first, then drop those under the threshold.
+        ranked = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)[:k]
+
+        chunks = []
+        scores = []
+        for chunk_id, fused_score in ranked:
+            if fused_score >= score_threshold and chunk_id in chunk_map:
+                chunks.append(chunk_map[chunk_id])
+                scores.append(fused_score)
+
+        return QueryChunksResponse(chunks=chunks, scores=scores)
+

 class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
    """
@ -301,9 +434,10 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
    and creates a cache of VectorDBWithIndex instances (each wrapping a SQLiteVecIndex).
    """

-    def __init__(self, config, inference_api: Inference) -> None:
+    def __init__(self, config, inference_api: Inference, files_api: Files | None) -> None:
        self.config = config
        self.inference_api = inference_api
+        self.files_api = files_api
        self.cache: dict[str, VectorDBWithIndex] = {}
        self.openai_vector_stores: dict[str, dict[str, Any]] = {}

@ -343,7 +477,9 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
            vector_db_data = row[0]
            vector_db = VectorDB.model_validate_json(vector_db_data)
            index = await SQLiteVecIndex.create(
-                vector_db.embedding_dimension, self.config.db_path, vector_db.identifier
+                vector_db.embedding_dimension,
+                self.config.db_path,
+                vector_db.identifier,
            )
            self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)

@ -369,7 +505,11 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
                connection.close()

        await asyncio.to_thread(_register_db)
-        index = await SQLiteVecIndex.create(vector_db.embedding_dimension, self.config.db_path, vector_db.identifier)
+        index = await SQLiteVecIndex.create(
+            vector_db.embedding_dimension,
+            self.config.db_path,
+            vector_db.identifier,
+        )
        self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)

    async def list_vector_dbs(self) -> list[VectorDB]:
--- a/llama_stack/providers/registry/vector_io.py
+++ b/llama_stack/providers/registry/vector_io.py
@ -24,6 +24,7 @@ def available_providers() -> list[ProviderSpec]:
            config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig",
            deprecation_warning="Please use the `inline::faiss` provider instead.",
            api_dependencies=[Api.inference],
+            optional_api_dependencies=[Api.files],
        ),
        InlineProviderSpec(
            api=Api.vector_io,
@ -32,6 +33,7 @@ def available_providers() -> list[ProviderSpec]:
            module="llama_stack.providers.inline.vector_io.faiss",
            config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig",
            api_dependencies=[Api.inference],
+            optional_api_dependencies=[Api.files],
        ),
        # NOTE: sqlite-vec cannot be bundled into the container image because it does not have a
        # source distribution and the wheels are not available for all platforms.
@ -42,6 +44,7 @@ def available_providers() -> list[ProviderSpec]:
            module="llama_stack.providers.inline.vector_io.sqlite_vec",
            config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig",
            api_dependencies=[Api.inference],
+            optional_api_dependencies=[Api.files],
        ),
        InlineProviderSpec(
            api=Api.vector_io,
@ -51,6 +54,7 @@ def available_providers() -> list[ProviderSpec]:
            config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig",
            deprecation_warning="Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.",
            api_dependencies=[Api.inference],
+            optional_api_dependencies=[Api.files],
        ),
        remote_provider_spec(
            Api.vector_io,
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@ -318,6 +318,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
        user: str | None = None,
        guided_choice: list[str] | None = None,
        prompt_logprobs: int | None = None,
+        suffix: str | None = None,
    ) -> OpenAICompletion:
        model_obj = await self.model_store.get_model(model)

--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@ -316,6 +316,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        user: str | None = None,
        guided_choice: list[str] | None = None,
        prompt_logprobs: int | None = None,
+        suffix: str | None = None,
    ) -> OpenAICompletion:
        provider_model_id = await self._get_provider_model_id(model)

--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@ -33,7 +33,6 @@ from llama_stack.apis.inference import (
    JsonSchemaResponseFormat,
    LogProbConfig,
    Message,
-    OpenAIEmbeddingsResponse,
    ResponseFormat,
    SamplingParams,
    TextTruncation,
@ -46,6 +45,8 @@ from llama_stack.apis.inference.inference import (
    OpenAIChatCompletion,
    OpenAIChatCompletionChunk,
    OpenAICompletion,
+    OpenAIEmbeddingsResponse,
+    OpenAIEmbeddingUsage,
    OpenAIMessageParam,
    OpenAIResponseFormatParam,
 )
@ -62,8 +63,10 @@ from llama_stack.providers.utils.inference.model_registry import (
 from llama_stack.providers.utils.inference.openai_compat import (
    OpenAICompatCompletionChoice,
    OpenAICompatCompletionResponse,
+    b64_encode_openai_embeddings_response,
    get_sampling_options,
    prepare_openai_completion_params,
+    prepare_openai_embeddings_params,
    process_chat_completion_response,
    process_chat_completion_stream_response,
    process_completion_response,
@ -386,7 +389,35 @@ class OllamaInferenceAdapter(
        dimensions: int | None = None,
        user: str | None = None,
    ) -> OpenAIEmbeddingsResponse:
-        raise NotImplementedError()
+        model_obj = await self._get_model(model)
+        if model_obj.model_type != ModelType.embedding:
+            raise ValueError(f"Model {model} is not an embedding model")
+
+        if model_obj.provider_resource_id is None:
+            raise ValueError(f"Model {model} has no provider_resource_id set")
+
+        # Note, at the moment Ollama does not support encoding_format, dimensions, and user parameters
+        params = prepare_openai_embeddings_params(
+            model=model_obj.provider_resource_id,
+            input=input,
+            encoding_format=encoding_format,
+            dimensions=dimensions,
+            user=user,
+        )
+
+        response = await self.openai_client.embeddings.create(**params)
+        data = b64_encode_openai_embeddings_response(response.data, encoding_format)
+
+        usage = OpenAIEmbeddingUsage(
+            prompt_tokens=response.usage.prompt_tokens,
+            total_tokens=response.usage.total_tokens,
+        )
+        # TODO: Investigate why model_obj.identifier is used instead of response.model
+        return OpenAIEmbeddingsResponse(
+            data=data,
+            model=model_obj.identifier,
+            usage=usage,
+        )

    async def openai_completion(
        self,
@ -409,6 +440,7 @@ class OllamaInferenceAdapter(
        user: str | None = None,
        guided_choice: list[str] | None = None,
        prompt_logprobs: int | None = None,
+        suffix: str | None = None,
    ) -> OpenAICompletion:
        if not isinstance(prompt, str):
            raise ValueError("Ollama does not support non-string prompts for completion")
@ -432,6 +464,7 @@ class OllamaInferenceAdapter(
            temperature=temperature,
            top_p=top_p,
            user=user,
+            suffix=suffix,
        )
        return await self.openai_client.completions.create(**params)  # type: ignore

--- a/llama_stack/providers/remote/inference/openai/openai.py
+++ b/llama_stack/providers/remote/inference/openai/openai.py
@ -90,6 +90,7 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
        user: str | None = None,
        guided_choice: list[str] | None = None,
        prompt_logprobs: int | None = None,
+        suffix: str | None = None,
    ) -> OpenAICompletion:
        if guided_choice is not None:
            logging.warning("guided_choice is not supported by the OpenAI API. Ignoring.")
@ -117,6 +118,7 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
            temperature=temperature,
            top_p=top_p,
            user=user,
+            suffix=suffix,
        )
        return await self._openai_client.completions.create(**params)

--- a/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py
@ -242,6 +242,7 @@ class PassthroughInferenceAdapter(Inference):
        user: str | None = None,
        guided_choice: list[str] | None = None,
        prompt_logprobs: int | None = None,
+        suffix: str | None = None,
    ) -> OpenAICompletion:
        client = self._get_client()
        model_obj = await self.model_store.get_model(model)
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@ -299,6 +299,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
        user: str | None = None,
        guided_choice: list[str] | None = None,
        prompt_logprobs: int | None = None,
+        suffix: str | None = None,
    ) -> OpenAICompletion:
        model_obj = await self.model_store.get_model(model)
        params = await prepare_openai_completion_params(
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@ -559,6 +559,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
        user: str | None = None,
        guided_choice: list[str] | None = None,
        prompt_logprobs: int | None = None,
+        suffix: str | None = None,
    ) -> OpenAICompletion:
        self._lazy_initialize_client()
        model_obj = await self._get_model(model)
--- a/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py
@ -313,6 +313,7 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
        user: str | None = None,
        guided_choice: list[str] | None = None,
        prompt_logprobs: int | None = None,
+        suffix: str | None = None,
    ) -> OpenAICompletion:
        model_obj = await self.model_store.get_model(model)
        params = await prepare_openai_completion_params(
--- a/llama_stack/providers/remote/vector_io/chroma/chroma.py
+++ b/llama_stack/providers/remote/vector_io/chroma/chroma.py
@ -23,6 +23,7 @@ from llama_stack.apis.vector_io import (
    VectorStoreObject,
    VectorStoreSearchResponsePage,
 )
+from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
 from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
 from llama_stack.providers.inline.vector_io.chroma import ChromaVectorIOConfig as InlineChromaVectorIOConfig
 from llama_stack.providers.utils.memory.vector_store import (
@ -104,6 +105,17 @@ class ChromaIndex(EmbeddingIndex):
    ) -> QueryChunksResponse:
        raise NotImplementedError("Keyword search is not supported in Chroma")

+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        """
+        Hybrid (embedding + query string) search entry point; not implemented for Chroma.
+
+        Raises:
+            NotImplementedError: Always; every argument is ignored.
+        """
+        raise NotImplementedError("Hybrid search is not supported in Chroma")
+

 class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
    def __init__(
@ -241,3 +253,12 @@ class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
        rewrite_query: bool | None = False,
    ) -> VectorStoreSearchResponsePage:
        raise NotImplementedError("OpenAI Vector Stores API is not supported in Chroma")
+
+    async def openai_attach_file_to_vector_store(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        attributes: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> VectorStoreFileObject:
+        """
+        Attach a file to a vector store; not implemented for Chroma.
+
+        Raises:
+            NotImplementedError: Always; every argument is ignored.
+        """
+        raise NotImplementedError("OpenAI Vector Stores API is not supported in Chroma")
--- a/llama_stack/providers/remote/vector_io/milvus/milvus.py
+++ b/llama_stack/providers/remote/vector_io/milvus/milvus.py
@ -25,6 +25,7 @@ from llama_stack.apis.vector_io import (
    VectorStoreObject,
    VectorStoreSearchResponsePage,
 )
+from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
 from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
 from llama_stack.providers.inline.vector_io.milvus import MilvusVectorIOConfig as InlineMilvusVectorIOConfig
 from llama_stack.providers.utils.memory.vector_store import (
@ -102,6 +103,17 @@ class MilvusIndex(EmbeddingIndex):
    ) -> QueryChunksResponse:
        raise NotImplementedError("Keyword search is not supported in Milvus")

+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        """
+        Hybrid (embedding + query string) search entry point; not implemented for Milvus.
+
+        Raises:
+            NotImplementedError: Always; every argument is ignored.
+        """
+        raise NotImplementedError("Hybrid search is not supported in Milvus")
+

 class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
    def __init__(
@ -240,6 +252,15 @@ class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
    ) -> VectorStoreSearchResponsePage:
        raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")

+    async def openai_attach_file_to_vector_store(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        attributes: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> VectorStoreFileObject:
+        """
+        Attach a file to a vector store; not implemented for Milvus.
+
+        Raises:
+            NotImplementedError: Always; every argument is ignored.
+        """
+        raise NotImplementedError("OpenAI Vector Stores API is not supported in Milvus")
+

 def generate_chunk_id(document_id: str, chunk_text: str) -> str:
    """Generate a unique chunk ID using a hash of document ID and chunk text."""
--- a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
+++ b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
@ -128,6 +128,17 @@ class PGVectorIndex(EmbeddingIndex):
    ) -> QueryChunksResponse:
        raise NotImplementedError("Keyword search is not supported in PGVector")

+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        """
+        Hybrid (embedding + query string) search entry point; not implemented for PGVector.
+
+        Raises:
+            NotImplementedError: Always; every argument is ignored.
+        """
+        raise NotImplementedError("Hybrid search is not supported in PGVector")
+
    async def delete(self):
        with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute(f"DROP TABLE IF EXISTS {self.table_name}")
--- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
+++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
@ -23,6 +23,7 @@ from llama_stack.apis.vector_io import (
    VectorStoreObject,
    VectorStoreSearchResponsePage,
 )
+from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
 from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
 from llama_stack.providers.inline.vector_io.qdrant import QdrantVectorIOConfig as InlineQdrantVectorIOConfig
 from llama_stack.providers.utils.memory.vector_store import (
@ -111,6 +112,17 @@ class QdrantIndex(EmbeddingIndex):
    ) -> QueryChunksResponse:
        raise NotImplementedError("Keyword search is not supported in Qdrant")

+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        """
+        Hybrid (embedding + query string) search entry point; not implemented for Qdrant.
+
+        Raises:
+            NotImplementedError: Always; every argument is ignored.
+        """
+        raise NotImplementedError("Hybrid search is not supported in Qdrant")
+
    async def delete(self):
        await self.client.delete_collection(collection_name=self.collection_name)

@ -241,3 +253,12 @@ class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
        rewrite_query: bool | None = False,
    ) -> VectorStoreSearchResponsePage:
        raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")
+
+    async def openai_attach_file_to_vector_store(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        attributes: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> VectorStoreFileObject:
+        """
+        Attach a file to a vector store; not implemented for Qdrant.
+
+        Raises:
+            NotImplementedError: Always; every argument is ignored.
+        """
+        raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")
--- a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
+++ b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
@ -92,6 +92,17 @@ class WeaviateIndex(EmbeddingIndex):
    ) -> QueryChunksResponse:
        raise NotImplementedError("Keyword search is not supported in Weaviate")

+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        """
+        Hybrid (embedding + query string) search entry point; not implemented for Weaviate.
+
+        Raises:
+            NotImplementedError: Always; every argument is ignored.
+        """
+        raise NotImplementedError("Hybrid search is not supported in Weaviate")
+

 class WeaviateVectorIOAdapter(
    VectorIO,
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@ -4,8 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-import base64
-import struct
 from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any

@ -37,7 +35,6 @@ from llama_stack.apis.inference.inference import (
    OpenAIChatCompletion,
    OpenAIChatCompletionChunk,
    OpenAICompletion,
-    OpenAIEmbeddingData,
    OpenAIEmbeddingsResponse,
    OpenAIEmbeddingUsage,
    OpenAIMessageParam,
@ -48,6 +45,7 @@ from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 from llama_stack.providers.utils.inference.openai_compat import (
+    b64_encode_openai_embeddings_response,
    convert_message_to_openai_dict_new,
    convert_openai_chat_completion_choice,
    convert_openai_chat_completion_stream,
@ -293,16 +291,7 @@ class LiteLLMOpenAIMixin(
        )

        # Convert response to OpenAI format
-        data = []
-        for i, embedding_data in enumerate(response["data"]):
-            # we encode to base64 if the encoding format is base64 in the request
-            if encoding_format == "base64":
-                byte_data = b"".join(struct.pack("f", f) for f in embedding_data["embedding"])
-                embedding = base64.b64encode(byte_data).decode("utf-8")
-            else:
-                embedding = embedding_data["embedding"]
-
-            data.append(OpenAIEmbeddingData(embedding=embedding, index=i))
+        data = b64_encode_openai_embeddings_response(response.data, encoding_format)

        usage = OpenAIEmbeddingUsage(
            prompt_tokens=response["usage"]["prompt_tokens"],
@ -336,6 +325,7 @@ class LiteLLMOpenAIMixin(
        user: str | None = None,
        guided_choice: list[str] | None = None,
        prompt_logprobs: int | None = None,
+        suffix: str | None = None,
    ) -> OpenAICompletion:
        model_obj = await self.model_store.get_model(model)
        params = await prepare_openai_completion_params(
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@ -3,8 +3,10 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import base64
 import json
 import logging
+import struct
 import time
 import uuid
 import warnings
@ -108,6 +110,7 @@ from llama_stack.apis.inference.inference import (
    OpenAIChatCompletion,
    OpenAICompletion,
    OpenAICompletionChoice,
+    OpenAIEmbeddingData,
    OpenAIMessageParam,
    OpenAIResponseFormatParam,
    ToolConfig,
@ -1287,6 +1290,7 @@ class OpenAICompletionToLlamaStackMixin:
        user: str | None = None,
        guided_choice: list[str] | None = None,
        prompt_logprobs: int | None = None,
+        suffix: str | None = None,
    ) -> OpenAICompletion:
        if stream:
            raise ValueError(f"{self.__class__.__name__} doesn't support streaming openai completions")
@ -1483,3 +1487,55 @@ class OpenAIChatCompletionToLlamaStackMixin:
            model=model,
            object="chat.completion",
        )
+
+
+def prepare_openai_embeddings_params(
+    model: str,
+    input: str | list[str],
+    encoding_format: str | None = "float",
+    dimensions: int | None = None,
+    user: str | None = None,
+):
+    """
+    Build the keyword arguments for an OpenAI-style embeddings request.
+
+    A single input string is wrapped in a one-element list. The optional
+    settings (encoding_format, dimensions, user) are included only when they
+    are not None.
+
+    Raises:
+        ValueError: If model is None.
+    """
+    if model is None:
+        raise ValueError("Model must be provided for embeddings")
+
+    params: dict[str, Any] = {
+        "model": model,
+        "input": [input] if isinstance(input, str) else input,
+    }
+
+    optional_settings = (
+        ("encoding_format", encoding_format),
+        ("dimensions", dimensions),
+        ("user", user),
+    )
+    params.update({key: value for key, value in optional_settings if value is not None})
+
+    return params
+
+
+def b64_encode_openai_embeddings_response(
+    response_data: list[Any], encoding_format: str | None = "float"
+) -> list[OpenAIEmbeddingData]:
+    """
+    Convert the items of an embeddings response into OpenAIEmbeddingData entries.
+
+    Args:
+        response_data: Sequence of embedding items, each exposing its vector either as
+            an `embedding` attribute or under an "embedding" key
+        encoding_format: When "base64", each vector is packed as little-endian float32
+            values and base64-encoded; any other value passes the vector through unchanged
+
+    Returns:
+        One OpenAIEmbeddingData per input item, indexed by its position in response_data
+    """
+    data = []
+    for i, embedding_data in enumerate(response_data):
+        # Accept both attribute-style items and plain dicts so every caller can share
+        # this helper regardless of which client library produced the response.
+        if isinstance(embedding_data, dict):
+            raw_embedding = embedding_data["embedding"]
+        else:
+            raw_embedding = embedding_data.embedding
+
+        if encoding_format == "base64":
+            values = [float(embedding_value) for embedding_value in raw_embedding]
+            # Explicit "<" keeps the byte layout identical on every platform;
+            # a bare "f" would follow the host's native byte order.
+            byte_data = struct.pack(f"<{len(values)}f", *values)
+            response_embedding = base64.b64encode(byte_data).decode("utf-8")
+        else:
+            response_embedding = raw_embedding
+        data.append(
+            OpenAIEmbeddingData(
+                embedding=response_embedding,
+                index=i,
+            )
+        )
+    return data
--- a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
+++ b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
@ -5,11 +5,13 @@
 # the root directory of this source tree.

 import logging
+import mimetypes
 import time
 import uuid
 from abc import ABC, abstractmethod
 from typing import Any

+from llama_stack.apis.files import Files
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.apis.vector_io import (
    QueryChunksResponse,
@ -20,6 +22,15 @@ from llama_stack.apis.vector_io import (
    VectorStoreSearchResponse,
    VectorStoreSearchResponsePage,
 )
+from llama_stack.apis.vector_io.vector_io import (
+    Chunk,
+    VectorStoreChunkingStrategy,
+    VectorStoreChunkingStrategyAuto,
+    VectorStoreChunkingStrategyStatic,
+    VectorStoreFileLastError,
+    VectorStoreFileObject,
+)
+from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, make_overlapped_chunks

 logger = logging.getLogger(__name__)

@ -36,6 +47,7 @@ class OpenAIVectorStoreMixin(ABC):

    # These should be provided by the implementing class
    openai_vector_stores: dict[str, dict[str, Any]]
+    files_api: Files | None

    @abstractmethod
    async def _save_openai_vector_store(self, store_id: str, store_info: dict[str, Any]) -> None:
@ -67,6 +79,16 @@ class OpenAIVectorStoreMixin(ABC):
        """Unregister a vector database (provider-specific implementation)."""
        pass

+    @abstractmethod
+    async def insert_chunks(
+        self,
+        vector_db_id: str,
+        chunks: list[Chunk],
+        ttl_seconds: int | None = None,
+    ) -> None:
+        """
+        Insert chunks into a vector database (provider-specific implementation).
+
+        openai_attach_file_to_vector_store calls this with the vector store ID as
+        `vector_db_id` and the chunks produced from the attached file.
+
+        Args:
+            vector_db_id: Identifier of the vector database receiving the chunks
+            chunks: Chunks to insert
+            ttl_seconds: Optional time-to-live; presumably in seconds, with semantics
+                left to the implementing provider (not visible here)
+        """
+        pass
+
    @abstractmethod
    async def query_chunks(
        self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
@ -383,3 +405,78 @@ class OpenAIVectorStoreMixin(ABC):
            if metadata[key] != value:
                return False
        return True
+
+    async def openai_attach_file_to_vector_store(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        attributes: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> VectorStoreFileObject:
+        """
+        Attach a file to a vector store by chunking its content and inserting the chunks.
+
+        Failures are reported through the returned object rather than raised: when the
+        Files API is missing, no chunks are produced, or any step raises, the result has
+        status "failed" and `last_error` describes the problem.
+
+        Args:
+            vector_store_id: ID of the vector store; also passed to insert_chunks as the vector DB ID
+            file_id: ID of the file to fetch through the Files API
+            attributes: Optional attributes stored on the result and passed to the chunker
+            chunking_strategy: Chunking configuration; defaults to VectorStoreChunkingStrategyAuto
+
+        Returns:
+            A VectorStoreFileObject whose status is "completed" or "failed"
+        """
+        attributes = attributes or {}
+        chunking_strategy = chunking_strategy or VectorStoreChunkingStrategyAuto()
+
+        vector_store_file_object = VectorStoreFileObject(
+            id=file_id,
+            attributes=attributes,
+            chunking_strategy=chunking_strategy,
+            created_at=int(time.time()),
+            status="in_progress",
+            vector_store_id=vector_store_id,
+        )
+
+        # files_api is an optional dependency; without it the file cannot be read.
+        if not hasattr(self, "files_api") or not self.files_api:
+            vector_store_file_object.status = "failed"
+            vector_store_file_object.last_error = VectorStoreFileLastError(
+                code="server_error",
+                message="Files API is not available",
+            )
+            return vector_store_file_object
+
+        if isinstance(chunking_strategy, VectorStoreChunkingStrategyStatic):
+            max_chunk_size_tokens = chunking_strategy.static.max_chunk_size_tokens
+            chunk_overlap_tokens = chunking_strategy.static.chunk_overlap_tokens
+        else:
+            # Default values from OpenAI API spec
+            max_chunk_size_tokens = 800
+            chunk_overlap_tokens = 400
+
+        try:
+            file_response = await self.files_api.openai_retrieve_file(file_id)
+            # The MIME type is guessed from the filename; it is None when unrecognized.
+            mime_type, _ = mimetypes.guess_type(file_response.filename)
+            content_response = await self.files_api.openai_retrieve_file_content(file_id)
+
+            content = content_from_data_and_mime_type(content_response.body, mime_type)
+
+            chunks = make_overlapped_chunks(
+                file_id,
+                content,
+                max_chunk_size_tokens,
+                chunk_overlap_tokens,
+                attributes,
+            )
+
+            if not chunks:
+                vector_store_file_object.status = "failed"
+                vector_store_file_object.last_error = VectorStoreFileLastError(
+                    code="server_error",
+                    message="No chunks were generated from the file",
+                )
+                return vector_store_file_object
+
+            await self.insert_chunks(
+                vector_db_id=vector_store_id,
+                chunks=chunks,
+            )
+        except Exception as e:
+            # The exception is swallowed and surfaced only as a status, so log it with
+            # its traceback; otherwise the origin of the failure is lost.
+            logger.exception(f"Error attaching file to vector store: {e}")
+            vector_store_file_object.status = "failed"
+            vector_store_file_object.last_error = VectorStoreFileLastError(
+                code="server_error",
+                message=str(e),
+            )
+            return vector_store_file_object
+
+        vector_store_file_object.status = "completed"
+
+        return vector_store_file_object
--- a/llama_stack/providers/utils/memory/vector_store.py
+++ b/llama_stack/providers/utils/memory/vector_store.py
@ -32,6 +32,10 @@ from llama_stack.providers.utils.inference.prompt_adapter import (

 log = logging.getLogger(__name__)

+# Constants for reranker types
+RERANKER_TYPE_RRF = "rrf"
+RERANKER_TYPE_WEIGHTED = "weighted"
+

 def parse_pdf(data: bytes) -> str:
    # For PDF and DOC/DOCX files, we can't reliably convert to string
@ -72,16 +76,18 @@ def content_from_data(data_url: str) -> str:
        data = unquote(data)
        encoding = parts["encoding"] or "utf-8"
        data = data.encode(encoding)
+    return content_from_data_and_mime_type(data, parts["mimetype"], parts.get("encoding", None))

-    encoding = parts["encoding"]
-    if not encoding:
-        import chardet

-        detected = chardet.detect(data)
-        encoding = detected["encoding"]
+def content_from_data_and_mime_type(data: bytes | str, mime_type: str | None, encoding: str | None = None) -> str:
+    if isinstance(data, bytes):
+        if not encoding:
+            import chardet

-    mime_type = parts["mimetype"]
-    mime_category = mime_type.split("/")[0]
+            detected = chardet.detect(data)
+            encoding = detected["encoding"]
+
+    mime_category = mime_type.split("/")[0] if mime_type else None
    if mime_category == "text":
        # For text-based files (including CSV, MD)
        return data.decode(encoding)
@ -200,6 +206,18 @@ class EmbeddingIndex(ABC):
    async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
        raise NotImplementedError()

+    @abstractmethod
+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        raise NotImplementedError()
+
    @abstractmethod
    async def delete(self):
        raise NotImplementedError()
@ -243,10 +261,29 @@ class VectorDBWithIndex:
        k = params.get("max_chunks", 3)
        mode = params.get("mode")
        score_threshold = params.get("score_threshold", 0.0)
+
+        # Get ranker configuration
+        ranker = params.get("ranker")
+        if ranker is None:
+            # Default to RRF with impact_factor=60.0
+            reranker_type = RERANKER_TYPE_RRF
+            reranker_params = {"impact_factor": 60.0}
+        else:
+            reranker_type = ranker.type
+            reranker_params = (
+                {"impact_factor": ranker.impact_factor} if ranker.type == RERANKER_TYPE_RRF else {"alpha": ranker.alpha}
+            )
+
        query_string = interleaved_content_as_str(query)
        if mode == "keyword":
            return await self.index.query_keyword(query_string, k, score_threshold)
+
+        # Calculate embeddings for both vector and hybrid modes
+        embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string])
+        query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
+        if mode == "hybrid":
+            return await self.index.query_hybrid(
+                query_vector, query_string, k, score_threshold, reranker_type, reranker_params
+            )
        else:
-            embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string])
-            query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
            return await self.index.query_vector(query_vector, k, score_threshold)
--- a/llama_stack/templates/ollama/build.yaml
+++ b/llama_stack/templates/ollama/build.yaml
@ -23,6 +23,8 @@ distribution_spec:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
+    files:
+    - inline::localfs
    post_training:
    - inline::huggingface
    tool_runtime:
--- a/llama_stack/templates/ollama/ollama.py
+++ b/llama_stack/templates/ollama/ollama.py
@ -13,6 +13,7 @@ from llama_stack.distribution.datatypes import (
    ShieldInput,
    ToolGroupInput,
 )
+from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
 from llama_stack.providers.inline.post_training.huggingface import HuggingFacePostTrainingConfig
 from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
@ -29,6 +30,7 @@ def get_distribution_template() -> DistributionTemplate:
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
+        "files": ["inline::localfs"],
        "post_training": ["inline::huggingface"],
        "tool_runtime": [
            "remote::brave-search",
@ -49,6 +51,11 @@ def get_distribution_template() -> DistributionTemplate:
        provider_type="inline::faiss",
        config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
    )
+    files_provider = Provider(
+        provider_id="meta-reference-files",
+        provider_type="inline::localfs",
+        config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+    )
    posttraining_provider = Provider(
        provider_id="huggingface",
        provider_type="inline::huggingface",
@ -98,6 +105,7 @@ def get_distribution_template() -> DistributionTemplate:
                provider_overrides={
                    "inference": [inference_provider],
                    "vector_io": [vector_io_provider_faiss],
+                    "files": [files_provider],
                    "post_training": [posttraining_provider],
                },
                default_models=[inference_model, embedding_model],
@ -107,6 +115,7 @@ def get_distribution_template() -> DistributionTemplate:
                provider_overrides={
                    "inference": [inference_provider],
                    "vector_io": [vector_io_provider_faiss],
+                    "files": [files_provider],
                    "post_training": [posttraining_provider],
                    "safety": [
                        Provider(
--- a/llama_stack/templates/ollama/run-with-safety.yaml
+++ b/llama_stack/templates/ollama/run-with-safety.yaml
@ -4,6 +4,7 @@ apis:
 - agents
 - datasetio
 - eval
+- files
 - inference
 - post_training
 - safety
@ -84,6 +85,14 @@ providers:
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:}
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/ollama/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/files_metadata.db
  post_training:
  - provider_id: huggingface
    provider_type: inline::huggingface
--- a/llama_stack/templates/ollama/run.yaml
+++ b/llama_stack/templates/ollama/run.yaml
@ -4,6 +4,7 @@ apis:
 - agents
 - datasetio
 - eval
+- files
 - inference
 - post_training
 - safety
@ -82,6 +83,14 @@ providers:
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:}
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/ollama/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/files_metadata.db
  post_training:
  - provider_id: huggingface
    provider_type: inline::huggingface
--- a/llama_stack/templates/starter/build.yaml
+++ b/llama_stack/templates/starter/build.yaml
@ -17,6 +17,8 @@ distribution_spec:
    - inline::sqlite-vec
    - remote::chromadb
    - remote::pgvector
+    files:
+    - inline::localfs
    safety:
    - inline::llama-guard
    agents:
--- a/llama_stack/templates/starter/run.yaml
+++ b/llama_stack/templates/starter/run.yaml
@ -4,6 +4,7 @@ apis:
 - agents
 - datasetio
 - eval
+- files
 - inference
 - safety
 - scoring
@ -75,6 +76,14 @@ providers:
      db: ${env.PGVECTOR_DB:}
      user: ${env.PGVECTOR_USER:}
      password: ${env.PGVECTOR_PASSWORD:}
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/starter/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/files_metadata.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
--- a/llama_stack/templates/starter/starter.py
+++ b/llama_stack/templates/starter/starter.py
@ -12,6 +12,7 @@ from llama_stack.distribution.datatypes import (
    ShieldInput,
    ToolGroupInput,
 )
+from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
 from llama_stack.providers.inline.inference.sentence_transformers import (
    SentenceTransformersInferenceConfig,
 )
@ -134,6 +135,7 @@ def get_distribution_template() -> DistributionTemplate:
    providers = {
        "inference": ([p.provider_type for p in inference_providers] + ["inline::sentence-transformers"]),
        "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"],
+        "files": ["inline::localfs"],
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
@ -170,6 +172,11 @@ def get_distribution_template() -> DistributionTemplate:
            ),
        ),
    ]
+    files_provider = Provider(
+        provider_id="meta-reference-files",
+        provider_type="inline::localfs",
+        config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+    )
    embedding_provider = Provider(
        provider_id="sentence-transformers",
        provider_type="inline::sentence-transformers",
@ -212,6 +219,7 @@ def get_distribution_template() -> DistributionTemplate:
                provider_overrides={
                    "inference": inference_providers + [embedding_provider],
                    "vector_io": vector_io_providers,
+                    "files": [files_provider],
                },
                default_models=default_models + [embedding_model],
                default_tool_groups=default_tool_groups,
--- a/tests/integration/inference/test_openai_completion.py
+++ b/tests/integration/inference/test_openai_completion.py
@ -22,9 +22,6 @@ def provider_from_model(client_with_models, model_id):


 def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id):
-    if isinstance(client_with_models, LlamaStackAsLibraryClient):
-        pytest.skip("OpenAI completions are not supported when testing with library client yet.")
-
    provider = provider_from_model(client_with_models, model_id)
    if provider.provider_type in (
        "inline::meta-reference",
@ -44,6 +41,23 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")


+def skip_if_model_doesnt_support_suffix(client_with_models, model_id):
+    # To test `fim` (fill-in-the-middle) completion, we need to use a model that supports suffix.
+    # Use this to specifically test this API functionality.
+
+    # pytest -sv --stack-config="inference=ollama" \
+    # tests/integration/inference/test_openai_completion.py \
+    # --text-model qwen2.5-coder:1.5b \
+    # -k test_openai_completion_non_streaming_suffix
+
+    if model_id != "qwen2.5-coder:1.5b":
+        pytest.skip(f"Suffix is not supported for the model: {model_id}.")
+
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type != "remote::ollama":
+        pytest.skip(f"Provider {provider.provider_type} doesn't support suffix.")
+
+
 def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
    if isinstance(client_with_models, LlamaStackAsLibraryClient):
        pytest.skip("OpenAI chat completions are not supported when testing with library client yet.")
@ -102,6 +116,32 @@ def test_openai_completion_non_streaming(llama_stack_client, client_with_models,
    assert len(choice.text) > 10


+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:completion:suffix",
+    ],
+)
+def test_openai_completion_non_streaming_suffix(llama_stack_client, client_with_models, text_model_id, test_case):
+    skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
+    skip_if_model_doesnt_support_suffix(client_with_models, text_model_id)
+    tc = TestCase(test_case)
+
+    # ollama needs more verbose prompting for some reason here...
+    response = llama_stack_client.completions.create(
+        model=text_model_id,
+        prompt=tc["content"],
+        stream=False,
+        suffix=tc["suffix"],
+        max_tokens=10,
+    )
+
+    assert len(response.choices) > 0
+    choice = response.choices[0]
+    assert len(choice.text) > 5
+    assert "france" in choice.text.lower()
+
+
@pytest.mark.parametrize(
    "test_case",
    [
--- a/tests/integration/inference/test_openai_embeddings.py
+++ b/tests/integration/inference/test_openai_embeddings.py
@ -51,7 +51,6 @@ def skip_if_model_doesnt_support_openai_embeddings(client, model_id):
        "remote::runpod",
        "remote::sambanova",
        "remote::tgi",
-        "remote::ollama",
    ):
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI embeddings.")

--- a/tests/integration/test_cases/inference/completion.json
+++ b/tests/integration/test_cases/inference/completion.json
@ -4,6 +4,12 @@
            "content": "Complete the sentence using one word: Roses are red, violets are "
        }
    },
+    "suffix": {
+        "data": {
+            "content": "The capital of ",
+            "suffix": "is Paris."
+        }
+    },
    "non_streaming": {
        "data": {
            "content": "Micheael Jordan is born in ",
--- a/tests/unit/providers/vector_io/test_sqlite_vec.py
+++ b/tests/unit/providers/vector_io/test_sqlite_vec.py
@ -84,6 +84,28 @@ async def test_query_chunks_full_text_search(sqlite_vec_index, sample_chunks, sa
    assert len(response_no_results.chunks) == 0, f"Expected 0 results, but got {len(response_no_results.chunks)}"


+@pytest.mark.asyncio
+async def test_query_chunks_hybrid(sqlite_vec_index, sample_chunks, sample_embeddings):
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # Create a query embedding that's similar to the first chunk
+    query_embedding = sample_embeddings[0]
+    query_string = "Sentence 5"
+
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=3,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+
+    assert len(response.chunks) == 3, f"Expected 3 results, got {len(response.chunks)}"
+    # Verify scores are in descending order (higher is better)
+    assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
+
+
@pytest.mark.asyncio
 async def test_query_chunks_full_text_search_k_greater_than_results(sqlite_vec_index, sample_chunks, sample_embeddings):
    # Re-initialize with a clean index
@ -141,3 +163,355 @@ def test_generate_chunk_id():
        "bc744db3-1b25-0a9c-cdff-b6ba3df73c36",
        "f68df25d-d9aa-ab4d-5684-64a233add20d",
    ]
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_no_keyword_matches(sqlite_vec_index, sample_chunks, sample_embeddings):
+    """Test hybrid search when keyword search returns no matches - should still return vector results."""
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # Use a non-existent keyword but a valid vector query
+    query_embedding = sample_embeddings[0]
+    query_string = "Sentence 499"
+
+    # First verify keyword search returns no results
+    keyword_response = await sqlite_vec_index.query_keyword(query_string, k=5, score_threshold=0.0)
+    assert len(keyword_response.chunks) == 0, "Keyword search should return no results"
+
+    # Get hybrid results
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=3,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+
+    # Should still get results from vector search
+    assert len(response.chunks) > 0, "Should get results from vector search even with no keyword matches"
+    # Verify scores are in descending order
+    assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_score_threshold(sqlite_vec_index, sample_chunks, sample_embeddings):
+    """Test hybrid search with a high score threshold."""
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # Use a very high score threshold that no results will meet
+    query_embedding = sample_embeddings[0]
+    query_string = "Sentence 5"
+
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=3,
+        score_threshold=1000.0,  # Very high threshold
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+
+    # Should return no results due to high threshold
+    assert len(response.chunks) == 0
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_different_embedding(
+    sqlite_vec_index, sample_chunks, sample_embeddings, embedding_dimension
+):
+    """Test hybrid search with a different embedding than the stored ones."""
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # Create a random embedding that's different from stored ones
+    query_embedding = np.random.rand(embedding_dimension).astype(np.float32)
+    query_string = "Sentence 5"
+
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=3,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+
+    # Should still get results if keyword matches exist
+    assert len(response.chunks) > 0
+    # Verify scores are in descending order
+    assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_rrf_ranking(sqlite_vec_index, sample_chunks, sample_embeddings):
+    """Test that RRF properly combines rankings when documents appear in both search methods."""
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # Create a query embedding that's similar to the first chunk
+    query_embedding = sample_embeddings[0]
+    # Use a keyword that appears in multiple documents
+    query_string = "Sentence 5"
+
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=5,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+
+    # Verify the hybrid query returned results (this does not check which search method produced them)
+    assert len(response.chunks) > 0
+    # Verify scores are in descending order (RRF should maintain this)
+    assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_score_selection(sqlite_vec_index, sample_chunks, sample_embeddings):
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # Create a query embedding that's similar to the first chunk
+    query_embedding = sample_embeddings[0]
+    # Use a keyword that appears in the first document
+    query_string = "Sentence 0 from document 0"
+
+    # Test weighted re-ranking
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="weighted",
+        reranker_params={"alpha": 0.5},
+    )
+    assert len(response.chunks) == 1
+    # Score should be weighted average of normalized keyword score and vector score
+    assert response.scores[0] > 0.5  # Both scores should be high
+
+    # Test RRF re-ranking
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+    assert len(response.chunks) == 1
+    # RRF score should be sum of reciprocal ranks
+    assert response.scores[0] == pytest.approx(2.0 / 61.0, rel=1e-6)  # 1/(60+1) + 1/(60+1)
+
+    # Repeat the explicit RRF query (no default reranker is exercised here); the score must match the run above
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+    assert len(response.chunks) == 1
+    assert response.scores[0] == pytest.approx(2.0 / 61.0, rel=1e-6)  # Should behave like RRF
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_mixed_results(sqlite_vec_index, sample_chunks, sample_embeddings):
+    """Test hybrid search with documents that appear in only one search method."""
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # Create a query embedding that's similar to the first chunk
+    query_embedding = sample_embeddings[0]
+    # Use a keyword that appears in a different document
+    query_string = "Sentence 9 from document 2"
+
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=3,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+
+    # Should get results from both search methods
+    assert len(response.chunks) > 0
+    # Verify scores are in descending order
+    assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
+    # Verify we get results from both the vector-similar document and keyword-matched document
+    doc_ids = {chunk.metadata["document_id"] for chunk in response.chunks}
+    assert "document-0" in doc_ids  # From vector search
+    assert "document-2" in doc_ids  # From keyword search
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_weighted_reranker_parametrization(
+    sqlite_vec_index, sample_chunks, sample_embeddings
+):
+    """Test WeightedReRanker with different alpha values."""
+    # Re-add data before each search to ensure test isolation
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+    query_embedding = sample_embeddings[0]
+    query_string = "Sentence 0 from document 0"
+
+    # alpha=1.0 (should behave like pure keyword)
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="weighted",
+        reranker_params={"alpha": 1.0},
+    )
+    assert len(response.chunks) > 0  # Should get at least one result
+    assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks)
+
+    # alpha=0.0 (should behave like pure vector)
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="weighted",
+        reranker_params={"alpha": 0.0},
+    )
+    assert len(response.chunks) > 0  # Should get at least one result
+    assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks)
+
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+    # alpha=0.7 (should be a mix)
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="weighted",
+        reranker_params={"alpha": 0.7},
+    )
+    assert len(response.chunks) > 0  # Should get at least one result
+    assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks)
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_rrf_impact_factor(sqlite_vec_index, sample_chunks, sample_embeddings):
+    """Test RRFReRanker with different impact factors."""
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+    query_embedding = sample_embeddings[0]
+    query_string = "Sentence 0 from document 0"
+
+    # impact_factor=10
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 10.0},
+    )
+    assert len(response.chunks) == 1
+    assert response.scores[0] == pytest.approx(2.0 / 11.0, rel=1e-6)
+
+    # impact_factor=100
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 100.0},
+    )
+    assert len(response.chunks) == 1
+    assert response.scores[0] == pytest.approx(2.0 / 101.0, rel=1e-6)
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_edge_cases(sqlite_vec_index, sample_chunks, sample_embeddings):
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # No results from either search - use a completely different embedding and a nonzero threshold
+    query_embedding = np.ones_like(sample_embeddings[0]) * -1  # Very different from sample embeddings
+    query_string = "no_such_keyword_that_will_never_match"
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=3,
+        score_threshold=0.1,  # Nonzero threshold to filter out low-similarity matches
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+    assert len(response.chunks) == 0
+
+    # All results below threshold
+    query_embedding = sample_embeddings[0]
+    query_string = "Sentence 0 from document 0"
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=3,
+        score_threshold=1000.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+    assert len(response.chunks) == 0
+
+    # Large k value
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=100,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+    # Should not error, should return all available results
+    assert len(response.chunks) > 0
+    assert len(response.chunks) <= 100
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_tie_breaking(
+    sqlite_vec_index, sample_embeddings, embedding_dimension, tmp_path_factory
+):
+    """Test tie-breaking and determinism when scores are equal."""
+    # Create two chunks with the same content and embedding
+    chunk1 = Chunk(content="identical", metadata={"document_id": "docA"})
+    chunk2 = Chunk(content="identical", metadata={"document_id": "docB"})
+    chunks = [chunk1, chunk2]
+    # Use the same embedding for both chunks to ensure equal scores
+    same_embedding = sample_embeddings[0]
+    embeddings = np.array([same_embedding, same_embedding])
+
+    # Clear existing data and recreate index
+    await sqlite_vec_index.delete()
+    temp_dir = tmp_path_factory.getbasetemp()
+    db_path = str(temp_dir / "test_sqlite.db")
+    sqlite_vec_index = await SQLiteVecIndex.create(dimension=embedding_dimension, db_path=db_path, bank_id="test_bank")
+    await sqlite_vec_index.add_chunks(chunks, embeddings)
+
+    # Query with the same embedding and content to ensure equal scores
+    query_embedding = same_embedding
+    query_string = "identical"
+
+    # Run multiple queries to verify determinism
+    responses = []
+    for _ in range(3):
+        response = await sqlite_vec_index.query_hybrid(
+            embedding=query_embedding,
+            query_string=query_string,
+            k=2,
+            score_threshold=0.0,
+            reranker_type="rrf",
+            reranker_params={"impact_factor": 60.0},
+        )
+        responses.append(response)
+
+    # Verify all responses are identical
+    first_response = responses[0]
+    for response in responses[1:]:
+        assert response.chunks == first_response.chunks
+        assert response.scores == first_response.scores
+
+    # Verify both chunks are returned with equal scores
+    assert len(first_response.chunks) == 2
+    assert first_response.scores[0] == first_response.scores[1]
+    assert {chunk.metadata["document_id"] for chunk in first_response.chunks} == {"docA", "docB"}
--- a/tests/verifications/openai_api/fixtures/pdfs/llama_stack_and_models.pdf
+++ b/tests/verifications/openai_api/fixtures/pdfs/llama_stack_and_models.pdf
--- a/tests/verifications/openai_api/fixtures/test_cases/responses.yaml
+++ b/tests/verifications/openai_api/fixtures/test_cases/responses.yaml
@ -31,6 +31,25 @@ test_response_web_search:
        search_context_size: "low"
      output: "128"

+test_response_file_search:
+  test_name: test_response_file_search
+  test_params:
+    case:
+    - case_id: "llama_experts"
+      input: "How many experts does the Llama 4 Maverick model have?"
+      tools:
+      - type: file_search
+        # vector_store_ids param for file_search tool gets added by the test runner
+      file_content: "Llama 4 Maverick has 128 experts"
+      output: "128"
+    - case_id: "llama_experts_pdf"
+      input: "How many experts does the Llama 4 Maverick model have?"
+      tools:
+      - type: file_search
+        # vector_store_ids param for file_search tool gets added by the test runner
+      file_path: "pdfs/llama_stack_and_models.pdf"
+      output: "128"
+
 test_response_mcp_tool:
  test_name: test_response_mcp_tool
  test_params:
--- a/tests/verifications/openai_api/test_responses.py
+++ b/tests/verifications/openai_api/test_responses.py
@ -5,6 +5,8 @@
 # the root directory of this source tree.

 import json
+import os
+import time

 import httpx
 import openai
@ -23,6 +25,31 @@ from tests.verifications.openai_api.fixtures.load import load_test_cases
 responses_test_cases = load_test_cases("responses")


+def _new_vector_store(openai_client, name):
+    # Ensure we don't reuse an existing vector store
+    vector_stores = openai_client.vector_stores.list()
+    for vector_store in vector_stores:
+        if vector_store.name == name:
+            openai_client.vector_stores.delete(vector_store_id=vector_store.id)
+
+    # Create a new vector store
+    vector_store = openai_client.vector_stores.create(
+        name=name,
+    )
+    return vector_store
+
+
+def _upload_file(openai_client, name, file_path):
+    """Delete any uploaded file named `name`, then upload `file_path` and return the created file object."""
+    for file in openai_client.files.list():
+        if file.filename == name:  # ensure we don't reuse an existing file
+            openai_client.files.delete(file_id=file.id)
+
+    # Upload the document (text or PDF); the context manager closes the handle once the upload returns.
+    with open(file_path, "rb") as f:
+        return openai_client.files.create(file=f, purpose="assistants")
+
+
@pytest.mark.parametrize(
    "case",
    responses_test_cases["test_response_basic"]["test_params"]["case"],
@ -258,6 +285,111 @@ def test_response_non_streaming_web_search(request, openai_client, model, provid
    assert case["output"].lower() in response.output_text.lower().strip()


+@pytest.mark.parametrize(
+    "case",
+    responses_test_cases["test_response_file_search"]["test_params"]["case"],
+    ids=case_id_generator,
+)
+def test_response_non_streaming_file_search(
+    request, openai_client, model, provider, verification_config, tmp_path, case
+):
+    """Verify a non-streaming response answers from a file attached to a vector store.
+
+    The case provides either inline ``file_content`` (written under ``tmp_path``) or a
+    ``file_path`` relative to the ``fixtures`` directory next to this module. The file
+    is uploaded and attached to a new vector store, then the response must report a
+    completed ``file_search_call`` whose first result, and the final output text,
+    contain the case's expected ``output``.
+
+    Raises:
+        ValueError: if the case has neither ``file_content`` nor ``file_path``.
+    """
+    if isinstance(openai_client, LlamaStackAsLibraryClient):
+        pytest.skip("Responses API file search is not yet supported in library client.")
+
+    test_name_base = get_base_test_name(request)
+    if should_skip_test(verification_config, provider, model, test_name_base):
+        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
+
+    vector_store = _new_vector_store(openai_client, "test_vector_store")
+
+    if "file_content" in case:
+        file_name = "test_response_non_streaming_file_search.txt"
+        file_path = tmp_path / file_name
+        file_path.write_text(case["file_content"])
+    elif "file_path" in case:
+        file_path = os.path.join(os.path.dirname(__file__), "fixtures", case["file_path"])
+        file_name = os.path.basename(file_path)
+    else:
+        raise ValueError(f"No file content or path provided for case {case['case_id']}")
+
+    file_response = _upload_file(openai_client, file_name, file_path)
+
+    # Attach our file to the vector store
+    file_attach_response = openai_client.vector_stores.files.create(
+        vector_store_id=vector_store.id,
+        file_id=file_response.id,
+    )
+
+    # Wait for the file to be attached, but only up to a deadline: if the status never
+    # leaves "in_progress" the assertion below fails instead of the test hanging forever
+    attach_timeout_seconds = 300
+    deadline = time.monotonic() + attach_timeout_seconds
+    while file_attach_response.status == "in_progress" and time.monotonic() < deadline:
+        time.sleep(0.1)
+        file_attach_response = openai_client.vector_stores.files.retrieve(
+            vector_store_id=vector_store.id,
+            file_id=file_response.id,
+        )
+    assert file_attach_response.status == "completed", f"Expected file to be attached, got {file_attach_response}"
+    assert not file_attach_response.last_error
+
+    # Build per-test copies of the tool definitions carrying the right vector store id,
+    # so the shared parametrized case data is not mutated
+    tools = [
+        {**tool, "vector_store_ids": [vector_store.id]} if tool["type"] == "file_search" else tool
+        for tool in case["tools"]
+    ]
+
+    # Create the response request, which should query our vector store
+    response = openai_client.responses.create(
+        model=model,
+        input=case["input"],
+        tools=tools,
+        stream=False,
+        include=["file_search_call.results"],
+    )
+
+    # Verify the file_search_tool was called
+    assert len(response.output) > 1
+    assert response.output[0].type == "file_search_call"
+    assert response.output[0].status == "completed"
+    assert response.output[0].queries  # ensure it's some non-empty list
+    assert response.output[0].results
+    assert case["output"].lower() in response.output[0].results[0].text.lower()
+    assert response.output[0].results[0].score > 0
+
+    # Verify the output_text generated by the response
+    assert case["output"].lower() in response.output_text.lower().strip()
+
+
+def test_response_non_streaming_file_search_empty_vector_store(
+    request, openai_client, model, provider, verification_config
+):
+    """Verify file search against a vector store with no attached files.
+
+    The response must still begin with a completed ``file_search_call`` that lists
+    its queries, carry no results, and produce some output text.
+    """
+    if isinstance(openai_client, LlamaStackAsLibraryClient):
+        pytest.skip("Responses API file search is not yet supported in library client.")
+
+    test_name_base = get_base_test_name(request)
+    if should_skip_test(verification_config, provider, model, test_name_base):
+        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
+
+    # No file is attached in this test, so the store stays empty
+    empty_store = _new_vector_store(openai_client, "test_vector_store")
+    file_search_tool = {"type": "file_search", "vector_store_ids": [empty_store.id]}
+
+    # Issue the request with the file_search tool pointed at the empty store
+    response = openai_client.responses.create(
+        model=model,
+        input="How many experts does the Llama 4 Maverick model have?",
+        tools=[file_search_tool],
+        stream=False,
+        include=["file_search_call.results"],
+    )
+
+    # The first output item must be the file search call
+    assert len(response.output) > 1
+    search_call = response.output[0]
+    assert search_call.type == "file_search_call"
+    assert search_call.status == "completed"
+    assert search_call.queries  # ensure it's some non-empty list
+    assert not search_call.results  # ensure we don't get any results
+
+    # Some output_text must still be generated by the response
+    assert response.output_text
+
+
@pytest.mark.parametrize(
    "case",
    responses_test_cases["test_response_mcp_tool"]["test_params"]["case"],