diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 96de04ec9..fddce0c57 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -3240,6 +3240,59 @@
}
}
},
+ "/v1/openai/v1/vector_stores/{vector_store_id}/files": {
+ "post": {
+ "responses": {
+ "200": {
+ "description": "A VectorStoreFileObject representing the attached file.",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/VectorStoreFileObject"
+ }
+ }
+ }
+ },
+ "400": {
+ "$ref": "#/components/responses/BadRequest400"
+ },
+ "429": {
+ "$ref": "#/components/responses/TooManyRequests429"
+ },
+ "500": {
+ "$ref": "#/components/responses/InternalServerError500"
+ },
+ "default": {
+ "$ref": "#/components/responses/DefaultError"
+ }
+ },
+ "tags": [
+ "VectorIO"
+ ],
+ "description": "Attach a file to a vector store.",
+ "parameters": [
+ {
+ "name": "vector_store_id",
+ "in": "path",
+ "description": "The ID of the vector store to attach the file to.",
+ "required": true,
+ "schema": {
+ "type": "string"
+ }
+ }
+ ],
+ "requestBody": {
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/OpenaiAttachFileToVectorStoreRequest"
+ }
+ }
+ },
+ "required": true
+ }
+ }
+ },
"/v1/openai/v1/completions": {
"post": {
"responses": {
@@ -7047,6 +7100,9 @@
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
},
+ {
+ "$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall"
+ },
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
},
@@ -7193,12 +7249,41 @@
"const": "file_search",
"default": "file_search"
},
- "vector_store_id": {
+ "vector_store_ids": {
"type": "array",
"items": {
"type": "string"
}
},
+ "filters": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "max_num_results": {
+ "type": "integer",
+ "default": 10
+ },
"ranking_options": {
"type": "object",
"properties": {
@@ -7217,7 +7302,7 @@
"additionalProperties": false,
"required": [
"type",
- "vector_store_id"
+ "vector_store_ids"
],
"title": "OpenAIResponseInputToolFileSearch"
},
@@ -7484,6 +7569,64 @@
],
"title": "OpenAIResponseOutputMessageContentOutputText"
},
+ "OpenAIResponseOutputMessageFileSearchToolCall": {
+ "type": "object",
+ "properties": {
+ "id": {
+ "type": "string"
+ },
+ "queries": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "status": {
+ "type": "string"
+ },
+ "type": {
+ "type": "string",
+ "const": "file_search_call",
+ "default": "file_search_call"
+ },
+ "results": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "id",
+ "queries",
+ "status",
+ "type"
+ ],
+ "title": "OpenAIResponseOutputMessageFileSearchToolCall"
+ },
"OpenAIResponseOutputMessageFunctionToolCall": {
"type": "object",
"properties": {
@@ -7760,6 +7903,9 @@
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
},
+ {
+ "$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall"
+ },
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
},
@@ -7775,6 +7921,7 @@
"mapping": {
"message": "#/components/schemas/OpenAIResponseMessage",
"web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall",
+ "file_search_call": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall",
"function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall",
"mcp_call": "#/components/schemas/OpenAIResponseOutputMessageMCPCall",
"mcp_list_tools": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools"
@@ -11766,6 +11913,232 @@
],
"title": "LogEventRequest"
},
+ "VectorStoreChunkingStrategy": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/VectorStoreChunkingStrategyAuto"
+ },
+ {
+ "$ref": "#/components/schemas/VectorStoreChunkingStrategyStatic"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "auto": "#/components/schemas/VectorStoreChunkingStrategyAuto",
+ "static": "#/components/schemas/VectorStoreChunkingStrategyStatic"
+ }
+ }
+ },
+ "VectorStoreChunkingStrategyAuto": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "auto",
+ "default": "auto"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type"
+ ],
+ "title": "VectorStoreChunkingStrategyAuto"
+ },
+ "VectorStoreChunkingStrategyStatic": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "static",
+ "default": "static"
+ },
+ "static": {
+ "$ref": "#/components/schemas/VectorStoreChunkingStrategyStaticConfig"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "static"
+ ],
+ "title": "VectorStoreChunkingStrategyStatic"
+ },
+ "VectorStoreChunkingStrategyStaticConfig": {
+ "type": "object",
+ "properties": {
+ "chunk_overlap_tokens": {
+ "type": "integer",
+ "default": 400
+ },
+ "max_chunk_size_tokens": {
+ "type": "integer",
+ "default": 800
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "chunk_overlap_tokens",
+ "max_chunk_size_tokens"
+ ],
+ "title": "VectorStoreChunkingStrategyStaticConfig"
+ },
+ "OpenaiAttachFileToVectorStoreRequest": {
+ "type": "object",
+ "properties": {
+ "file_id": {
+ "type": "string",
+ "description": "The ID of the file to attach to the vector store."
+ },
+ "attributes": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ },
+ "description": "The key-value attributes stored with the file, which can be used for filtering."
+ },
+ "chunking_strategy": {
+ "$ref": "#/components/schemas/VectorStoreChunkingStrategy",
+ "description": "The chunking strategy to use for the file."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "file_id"
+ ],
+ "title": "OpenaiAttachFileToVectorStoreRequest"
+ },
+ "VectorStoreFileLastError": {
+ "type": "object",
+ "properties": {
+ "code": {
+ "oneOf": [
+ {
+ "type": "string",
+ "const": "server_error"
+ },
+ {
+ "type": "string",
+ "const": "rate_limit_exceeded"
+ }
+ ]
+ },
+ "message": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "code",
+ "message"
+ ],
+ "title": "VectorStoreFileLastError"
+ },
+ "VectorStoreFileObject": {
+ "type": "object",
+ "properties": {
+ "id": {
+ "type": "string"
+ },
+ "object": {
+ "type": "string",
+ "default": "vector_store.file"
+ },
+ "attributes": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "chunking_strategy": {
+ "$ref": "#/components/schemas/VectorStoreChunkingStrategy"
+ },
+ "created_at": {
+ "type": "integer"
+ },
+ "last_error": {
+ "$ref": "#/components/schemas/VectorStoreFileLastError"
+ },
+ "status": {
+ "oneOf": [
+ {
+ "type": "string",
+ "const": "completed"
+ },
+ {
+ "type": "string",
+ "const": "in_progress"
+ },
+ {
+ "type": "string",
+ "const": "cancelled"
+ },
+ {
+ "type": "string",
+ "const": "failed"
+ }
+ ]
+ },
+ "usage_bytes": {
+ "type": "integer",
+ "default": 0
+ },
+ "vector_store_id": {
+ "type": "string"
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "id",
+ "object",
+ "attributes",
+ "chunking_strategy",
+ "created_at",
+ "status",
+ "usage_bytes",
+ "vector_store_id"
+ ],
+ "title": "VectorStoreFileObject",
+ "description": "OpenAI Vector Store File object."
+ },
"OpenAIJSONSchema": {
"type": "object",
"properties": {
@@ -12404,6 +12777,10 @@
},
"prompt_logprobs": {
"type": "integer"
+ },
+ "suffix": {
+ "type": "string",
+ "description": "(Optional) The suffix that should be appended to the completion."
}
},
"additionalProperties": false,
@@ -13621,7 +13998,11 @@
},
"mode": {
"type": "string",
- "description": "Search mode for retrieval—either \"vector\" or \"keyword\". Default \"vector\"."
+ "description": "Search mode for retrieval—either \"vector\", \"keyword\", or \"hybrid\". Default \"vector\"."
+ },
+ "ranker": {
+ "$ref": "#/components/schemas/Ranker",
+ "description": "Configuration for the ranker to use in hybrid search. Defaults to RRF ranker."
}
},
"additionalProperties": false,
@@ -13651,6 +14032,69 @@
}
}
},
+ "RRFRanker": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "rrf",
+ "default": "rrf",
+ "description": "The type of ranker, always \"rrf\""
+ },
+ "impact_factor": {
+ "type": "number",
+ "default": 60.0,
+ "description": "The impact factor for RRF scoring. Higher values give more weight to higher-ranked results. Must be greater than 0. Default of 60 is from the original RRF paper (Cormack et al., 2009)."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "impact_factor"
+ ],
+ "title": "RRFRanker",
+ "description": "Reciprocal Rank Fusion (RRF) ranker configuration."
+ },
+ "Ranker": {
+ "oneOf": [
+ {
+ "$ref": "#/components/schemas/RRFRanker"
+ },
+ {
+ "$ref": "#/components/schemas/WeightedRanker"
+ }
+ ],
+ "discriminator": {
+ "propertyName": "type",
+ "mapping": {
+ "rrf": "#/components/schemas/RRFRanker",
+ "weighted": "#/components/schemas/WeightedRanker"
+ }
+ }
+ },
+ "WeightedRanker": {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "const": "weighted",
+ "default": "weighted",
+ "description": "The type of ranker, always \"weighted\""
+ },
+ "alpha": {
+ "type": "number",
+ "default": 0.5,
+ "description": "Weight factor between 0 and 1. 0 means only use keyword scores, 1 means only use vector scores, values in between blend both scores."
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "type",
+ "alpha"
+ ],
+ "title": "WeightedRanker",
+ "description": "Weighted ranker configuration that combines vector and keyword scores."
+ },
"QueryRequest": {
"type": "object",
"properties": {
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index b2fe870be..49388939f 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -2263,6 +2263,43 @@ paths:
schema:
$ref: '#/components/schemas/LogEventRequest'
required: true
+ /v1/openai/v1/vector_stores/{vector_store_id}/files:
+ post:
+ responses:
+ '200':
+ description: >-
+ A VectorStoreFileObject representing the attached file.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/VectorStoreFileObject'
+ '400':
+ $ref: '#/components/responses/BadRequest400'
+ '429':
+ $ref: >-
+ #/components/responses/TooManyRequests429
+ '500':
+ $ref: >-
+ #/components/responses/InternalServerError500
+ default:
+ $ref: '#/components/responses/DefaultError'
+ tags:
+ - VectorIO
+ description: Attach a file to a vector store.
+ parameters:
+ - name: vector_store_id
+ in: path
+ description: >-
+ The ID of the vector store to attach the file to.
+ required: true
+ schema:
+ type: string
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/OpenaiAttachFileToVectorStoreRequest'
+ required: true
/v1/openai/v1/completions:
post:
responses:
@@ -5021,6 +5058,7 @@ components:
OpenAIResponseInput:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
+ - $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
- $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
- $ref: '#/components/schemas/OpenAIResponseMessage'
@@ -5115,10 +5153,23 @@ components:
type: string
const: file_search
default: file_search
- vector_store_id:
+ vector_store_ids:
type: array
items:
type: string
+ filters:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ max_num_results:
+ type: integer
+ default: 10
ranking_options:
type: object
properties:
@@ -5132,7 +5183,7 @@ components:
additionalProperties: false
required:
- type
- - vector_store_id
+ - vector_store_ids
title: OpenAIResponseInputToolFileSearch
OpenAIResponseInputToolFunction:
type: object
@@ -5294,6 +5345,41 @@ components:
- type
title: >-
OpenAIResponseOutputMessageContentOutputText
+ "OpenAIResponseOutputMessageFileSearchToolCall":
+ type: object
+ properties:
+ id:
+ type: string
+ queries:
+ type: array
+ items:
+ type: string
+ status:
+ type: string
+ type:
+ type: string
+ const: file_search_call
+ default: file_search_call
+ results:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - id
+ - queries
+ - status
+ - type
+ title: >-
+ OpenAIResponseOutputMessageFileSearchToolCall
"OpenAIResponseOutputMessageFunctionToolCall":
type: object
properties:
@@ -5491,6 +5577,7 @@ components:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseMessage'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
+ - $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
@@ -5499,6 +5586,7 @@ components:
mapping:
message: '#/components/schemas/OpenAIResponseMessage'
web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
+ file_search_call: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
function_call: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
@@ -8251,6 +8339,148 @@ components:
- event
- ttl_seconds
title: LogEventRequest
+ VectorStoreChunkingStrategy:
+ oneOf:
+ - $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
+ - $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
+ discriminator:
+ propertyName: type
+ mapping:
+ auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
+ static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
+ VectorStoreChunkingStrategyAuto:
+ type: object
+ properties:
+ type:
+ type: string
+ const: auto
+ default: auto
+ additionalProperties: false
+ required:
+ - type
+ title: VectorStoreChunkingStrategyAuto
+ VectorStoreChunkingStrategyStatic:
+ type: object
+ properties:
+ type:
+ type: string
+ const: static
+ default: static
+ static:
+ $ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
+ additionalProperties: false
+ required:
+ - type
+ - static
+ title: VectorStoreChunkingStrategyStatic
+ VectorStoreChunkingStrategyStaticConfig:
+ type: object
+ properties:
+ chunk_overlap_tokens:
+ type: integer
+ default: 400
+ max_chunk_size_tokens:
+ type: integer
+ default: 800
+ additionalProperties: false
+ required:
+ - chunk_overlap_tokens
+ - max_chunk_size_tokens
+ title: VectorStoreChunkingStrategyStaticConfig
+ OpenaiAttachFileToVectorStoreRequest:
+ type: object
+ properties:
+ file_id:
+ type: string
+ description: >-
+ The ID of the file to attach to the vector store.
+ attributes:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ description: >-
+ The key-value attributes stored with the file, which can be used for filtering.
+ chunking_strategy:
+ $ref: '#/components/schemas/VectorStoreChunkingStrategy'
+ description: >-
+ The chunking strategy to use for the file.
+ additionalProperties: false
+ required:
+ - file_id
+ title: OpenaiAttachFileToVectorStoreRequest
+ VectorStoreFileLastError:
+ type: object
+ properties:
+ code:
+ oneOf:
+ - type: string
+ const: server_error
+ - type: string
+ const: rate_limit_exceeded
+ message:
+ type: string
+ additionalProperties: false
+ required:
+ - code
+ - message
+ title: VectorStoreFileLastError
+ VectorStoreFileObject:
+ type: object
+ properties:
+ id:
+ type: string
+ object:
+ type: string
+ default: vector_store.file
+ attributes:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ chunking_strategy:
+ $ref: '#/components/schemas/VectorStoreChunkingStrategy'
+ created_at:
+ type: integer
+ last_error:
+ $ref: '#/components/schemas/VectorStoreFileLastError'
+ status:
+ oneOf:
+ - type: string
+ const: completed
+ - type: string
+ const: in_progress
+ - type: string
+ const: cancelled
+ - type: string
+ const: failed
+ usage_bytes:
+ type: integer
+ default: 0
+ vector_store_id:
+ type: string
+ additionalProperties: false
+ required:
+ - id
+ - object
+ - attributes
+ - chunking_strategy
+ - created_at
+ - status
+ - usage_bytes
+ - vector_store_id
+ title: VectorStoreFileObject
+ description: OpenAI Vector Store File object.
OpenAIJSONSchema:
type: object
properties:
@@ -8673,6 +8903,10 @@ components:
type: string
prompt_logprobs:
type: integer
+ suffix:
+ type: string
+ description: >-
+ (Optional) The suffix that should be appended to the completion.
additionalProperties: false
required:
- model
@@ -9526,7 +9760,13 @@ components:
mode:
type: string
description: >-
- Search mode for retrieval—either "vector" or "keyword". Default "vector".
+ Search mode for retrieval—either "vector", "keyword", or "hybrid". Default
+ "vector".
+ ranker:
+ $ref: '#/components/schemas/Ranker'
+ description: >-
+ Configuration for the ranker to use in hybrid search. Defaults to RRF
+ ranker.
additionalProperties: false
required:
- query_generator_config
@@ -9545,6 +9785,58 @@ components:
mapping:
default: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
llm: '#/components/schemas/LLMRAGQueryGeneratorConfig'
+ RRFRanker:
+ type: object
+ properties:
+ type:
+ type: string
+ const: rrf
+ default: rrf
+ description: The type of ranker, always "rrf"
+ impact_factor:
+ type: number
+ default: 60.0
+ description: >-
+ The impact factor for RRF scoring. Higher values give more weight to higher-ranked
+ results. Must be greater than 0. Default of 60 is from the original RRF
+ paper (Cormack et al., 2009).
+ additionalProperties: false
+ required:
+ - type
+ - impact_factor
+ title: RRFRanker
+ description: >-
+ Reciprocal Rank Fusion (RRF) ranker configuration.
+ Ranker:
+ oneOf:
+ - $ref: '#/components/schemas/RRFRanker'
+ - $ref: '#/components/schemas/WeightedRanker'
+ discriminator:
+ propertyName: type
+ mapping:
+ rrf: '#/components/schemas/RRFRanker'
+ weighted: '#/components/schemas/WeightedRanker'
+ WeightedRanker:
+ type: object
+ properties:
+ type:
+ type: string
+ const: weighted
+ default: weighted
+ description: The type of ranker, always "weighted"
+ alpha:
+ type: number
+ default: 0.5
+ description: >-
+ Weight factor between 0 and 1. 0 means only use keyword scores, 1 means
+ only use vector scores, values in between blend both scores.
+ additionalProperties: false
+ required:
+ - type
+ - alpha
+ title: WeightedRanker
+ description: >-
+ Weighted ranker configuration that combines vector and keyword scores.
QueryRequest:
type: object
properties:
diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md
index 4d148feda..e09c79359 100644
--- a/docs/source/distributions/self_hosted_distro/ollama.md
+++ b/docs/source/distributions/self_hosted_distro/ollama.md
@@ -18,6 +18,7 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
| agents | `inline::meta-reference` |
| datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` |
+| files | `inline::localfs` |
| inference | `remote::ollama` |
| post_training | `inline::huggingface` |
| safety | `inline::llama-guard` |
diff --git a/docs/source/providers/vector_io/sqlite-vec.md b/docs/source/providers/vector_io/sqlite-vec.md
index 49ba659f7..3c7c4cbee 100644
--- a/docs/source/providers/vector_io/sqlite-vec.md
+++ b/docs/source/providers/vector_io/sqlite-vec.md
@@ -66,25 +66,126 @@ To use sqlite-vec in your Llama Stack project, follow these steps:
2. Configure your Llama Stack project to use SQLite-Vec.
3. Start storing and querying vectors.
-## Supported Search Modes
+The SQLite-vec provider supports three search modes:
-The sqlite-vec provider supports both vector-based and keyword-based (full-text) search modes.
-
-When using the RAGTool interface, you can specify the desired search behavior via the `mode` parameter in
-`RAGQueryConfig`. For example:
+1. **Vector Search** (`mode="vector"`): Performs pure vector similarity search using the embeddings.
+2. **Keyword Search** (`mode="keyword"`): Performs full-text search using SQLite's FTS5.
+3. **Hybrid Search** (`mode="hybrid"`): Combines vector and keyword search for better results. Both searches are run independently and their results are merged with a configurable ranker (RRF or weighted).
+
+Example with hybrid search:
```python
-from llama_stack.apis.tool_runtime.rag import RAGQueryConfig
+response = await vector_io.query_chunks(
+ vector_db_id="my_db",
+ query="your query here",
+ params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
+)
-query_config = RAGQueryConfig(max_chunks=6, mode="vector")
+# Using RRF ranker
+response = await vector_io.query_chunks(
+ vector_db_id="my_db",
+ query="your query here",
+ params={
+ "mode": "hybrid",
+ "max_chunks": 3,
+ "score_threshold": 0.7,
+ "ranker": {"type": "rrf", "impact_factor": 60.0},
+ },
+)
-results = client.tool_runtime.rag_tool.query(
- vector_db_ids=[vector_db_id],
- content="what is torchtune",
- query_config=query_config,
+# Using weighted ranker
+response = await vector_io.query_chunks(
+ vector_db_id="my_db",
+ query="your query here",
+ params={
+ "mode": "hybrid",
+ "max_chunks": 3,
+ "score_threshold": 0.7,
+ "ranker": {"type": "weighted", "alpha": 0.7}, # 70% vector, 30% keyword
+ },
)
```
+Example with explicit vector search:
+```python
+response = await vector_io.query_chunks(
+ vector_db_id="my_db",
+ query="your query here",
+ params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
+)
+```
+
+Example with keyword search:
+```python
+response = await vector_io.query_chunks(
+ vector_db_id="my_db",
+ query="your query here",
+ params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
+)
+```
+
+## Supported Search Modes
+
+The SQLite vector store supports three search modes:
+
+1. **Vector Search** (`mode="vector"`): Uses vector similarity to find relevant chunks
+2. **Keyword Search** (`mode="keyword"`): Uses keyword matching to find relevant chunks
+3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword scores using a ranker
+
+### Hybrid Search
+
+Hybrid search combines the strengths of both vector and keyword search by:
+- Computing vector similarity scores
+- Computing keyword match scores
+- Using a ranker to combine these scores
+
+Two ranker types are supported (a short score-combination sketch follows the list):
+
+1. **RRF (Reciprocal Rank Fusion)**:
+ - Combines ranks from both vector and keyword results
+ - Uses an impact factor (default: 60.0) to control the weight of higher-ranked results
+ - Good for balancing between vector and keyword results
+ - The default impact factor of 60.0 comes from the original RRF paper by Cormack et al. (2009) [^1], which found this value to provide optimal performance across various retrieval tasks
+
+2. **Weighted**:
+ - Linearly combines normalized vector and keyword scores
+ - Uses an alpha parameter (0-1) to control the blend:
+ - alpha=0: Only use keyword scores
+ - alpha=1: Only use vector scores
+ - alpha=0.5: Equal weight to both (default)
+
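+For intuition, here is a minimal sketch of how the two rankers combine per-chunk scores. It mirrors the provider's internal reranking logic, but the function names are illustrative only, and the weighted case assumes scores have already been normalized to [0, 1]:
+
+```python
+def rrf_combine(
+    vector_scores: dict[str, float], keyword_scores: dict[str, float], k: float = 60.0
+) -> dict[str, float]:
+    # Rank each result set (1 = best), then sum reciprocal ranks: 1 / (k + rank).
+    vector_ranks = {doc: i + 1 for i, doc in enumerate(sorted(vector_scores, key=vector_scores.get, reverse=True))}
+    keyword_ranks = {doc: i + 1 for i, doc in enumerate(sorted(keyword_scores, key=keyword_scores.get, reverse=True))}
+    return {
+        doc: 1.0 / (k + vector_ranks.get(doc, float("inf"))) + 1.0 / (k + keyword_ranks.get(doc, float("inf")))
+        for doc in vector_scores.keys() | keyword_scores.keys()
+    }
+
+
+def weighted_combine(
+    vector_scores: dict[str, float], keyword_scores: dict[str, float], alpha: float = 0.5
+) -> dict[str, float]:
+    # alpha weights the (normalized) vector score, (1 - alpha) the keyword score.
+    return {
+        doc: alpha * vector_scores.get(doc, 0.0) + (1 - alpha) * keyword_scores.get(doc, 0.0)
+        for doc in vector_scores.keys() | keyword_scores.keys()
+    }
+```
+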
+Example using RAGQueryConfig with different search modes:
+
+```python
+from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker
+
+# Vector search
+config = RAGQueryConfig(mode="vector", max_chunks=5)
+
+# Keyword search
+config = RAGQueryConfig(mode="keyword", max_chunks=5)
+
+# Hybrid search with custom RRF ranker
+config = RAGQueryConfig(
+ mode="hybrid",
+ max_chunks=5,
+ ranker=RRFRanker(impact_factor=50.0), # Custom impact factor
+)
+
+# Hybrid search with weighted ranker
+config = RAGQueryConfig(
+ mode="hybrid",
+ max_chunks=5,
+ ranker=WeightedRanker(alpha=0.7), # 70% vector, 30% keyword
+)
+
+# Hybrid search with default RRF ranker
+config = RAGQueryConfig(
+ mode="hybrid", max_chunks=5
+) # Will use RRF with impact_factor=60.0
+```
+
+Note: The ranker configuration is only used in hybrid mode. For vector or keyword modes, the ranker parameter is ignored.
+
## Installation
You can install SQLite-Vec using pip:
@@ -96,3 +197,5 @@ pip install sqlite-vec
## Documentation
See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) for more details about sqlite-vec in general.
+
+[^1]: Cormack, G. V., Clarke, C. L., & Buettcher, S. (2009). [Reciprocal rank fusion outperforms condorcet and individual rank learning methods](https://dl.acm.org/doi/10.1145/1571941.1572114). In Proceedings of the 32nd international ACM SIGIR conference on Research and development in information retrieval (pp. 758-759).
diff --git a/llama_stack/apis/agents/openai_responses.py b/llama_stack/apis/agents/openai_responses.py
index 35b3d5ace..2e1cb257a 100644
--- a/llama_stack/apis/agents/openai_responses.py
+++ b/llama_stack/apis/agents/openai_responses.py
@@ -81,6 +81,15 @@ class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
type: Literal["web_search_call"] = "web_search_call"
+@json_schema_type
+class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
+ id: str
+ queries: list[str]
+ status: str
+ type: Literal["file_search_call"] = "file_search_call"
+ results: list[dict[str, Any]] | None = None
+
+
@json_schema_type
class OpenAIResponseOutputMessageFunctionToolCall(BaseModel):
call_id: str
@@ -119,6 +128,7 @@ class OpenAIResponseOutputMessageMCPListTools(BaseModel):
OpenAIResponseOutput = Annotated[
OpenAIResponseMessage
| OpenAIResponseOutputMessageWebSearchToolCall
+ | OpenAIResponseOutputMessageFileSearchToolCall
| OpenAIResponseOutputMessageFunctionToolCall
| OpenAIResponseOutputMessageMCPCall
| OpenAIResponseOutputMessageMCPListTools,
@@ -362,6 +372,7 @@ class OpenAIResponseInputFunctionToolCallOutput(BaseModel):
OpenAIResponseInput = Annotated[
# Responses API allows output messages to be passed in as input
OpenAIResponseOutputMessageWebSearchToolCall
+ | OpenAIResponseOutputMessageFileSearchToolCall
| OpenAIResponseOutputMessageFunctionToolCall
| OpenAIResponseInputFunctionToolCallOutput
     | OpenAIResponseMessage,
@@ -397,9 +408,10 @@ class FileSearchRankingOptions(BaseModel):
@json_schema_type
class OpenAIResponseInputToolFileSearch(BaseModel):
type: Literal["file_search"] = "file_search"
- vector_store_id: list[str]
+ vector_store_ids: list[str]
+ filters: dict[str, Any] | None = None
+ max_num_results: int | None = Field(default=10, ge=1, le=50)
ranking_options: FileSearchRankingOptions | None = None
- # TODO: add filters
class ApprovalFilter(BaseModel):
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 74697dd18..c440794f3 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -1038,6 +1038,8 @@ class InferenceProvider(Protocol):
# vLLM-specific parameters
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
+ # for fill-in-the-middle type completion
+ suffix: str | None = None,
) -> OpenAICompletion:
"""Generate an OpenAI-compatible completion for the given prompt using the specified model.
@@ -1058,6 +1060,7 @@ class InferenceProvider(Protocol):
:param temperature: (Optional) The temperature to use.
:param top_p: (Optional) The top p to use.
:param user: (Optional) The user to use.
+ :param suffix: (Optional) The suffix that should be appended to the completion.
:returns: An OpenAICompletion.
"""
...
diff --git a/llama_stack/apis/tools/rag_tool.py b/llama_stack/apis/tools/rag_tool.py
index 1e3542f74..72f68b7cb 100644
--- a/llama_stack/apis/tools/rag_tool.py
+++ b/llama_stack/apis/tools/rag_tool.py
@@ -15,6 +15,48 @@ from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
+@json_schema_type
+class RRFRanker(BaseModel):
+ """
+ Reciprocal Rank Fusion (RRF) ranker configuration.
+
+ :param type: The type of ranker, always "rrf"
+ :param impact_factor: The impact factor for RRF scoring. Higher values give more weight to higher-ranked results.
+ Must be greater than 0. Default of 60 is from the original RRF paper (Cormack et al., 2009).
+ """
+
+ type: Literal["rrf"] = "rrf"
+ impact_factor: float = Field(default=60.0, gt=0.0) # default of 60 for optimal performance
+
+
+@json_schema_type
+class WeightedRanker(BaseModel):
+ """
+ Weighted ranker configuration that combines vector and keyword scores.
+
+ :param type: The type of ranker, always "weighted"
+ :param alpha: Weight factor between 0 and 1.
+ 0 means only use keyword scores,
+ 1 means only use vector scores,
+ values in between blend both scores.
+ """
+
+ type: Literal["weighted"] = "weighted"
+ alpha: float = Field(
+ default=0.5,
+ ge=0.0,
+ le=1.0,
+ description="Weight factor between 0 and 1. 0 means only keyword scores, 1 means only vector scores.",
+ )
+
+
+Ranker = Annotated[
+ RRFRanker | WeightedRanker,
+ Field(discriminator="type"),
+]
+register_schema(Ranker, name="Ranker")
+
+
@json_schema_type
class RAGDocument(BaseModel):
"""
@@ -76,7 +118,8 @@ class RAGQueryConfig(BaseModel):
:param chunk_template: Template for formatting each retrieved chunk in the context.
Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict).
Default: "Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n"
- :param mode: Search mode for retrieval—either "vector" or "keyword". Default "vector".
+ :param mode: Search mode for retrieval—either "vector", "keyword", or "hybrid". Default "vector".
+ :param ranker: Configuration for the ranker to use in hybrid search. Defaults to RRF ranker.
"""
# This config defines how a query is generated using the messages
@@ -86,6 +129,7 @@ class RAGQueryConfig(BaseModel):
max_chunks: int = 5
chunk_template: str = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n"
mode: str | None = None
+ ranker: Ranker | None = Field(default=None) # Only used for hybrid mode
@field_validator("chunk_template")
def validate_chunk_template(cls, v: str) -> str:
diff --git a/llama_stack/apis/vector_io/vector_io.py b/llama_stack/apis/vector_io/vector_io.py
index 1c8ae4dab..77d4cfc5a 100644
--- a/llama_stack/apis/vector_io/vector_io.py
+++ b/llama_stack/apis/vector_io/vector_io.py
@@ -8,7 +8,7 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from typing import Any, Literal, Protocol, runtime_checkable
+from typing import Annotated, Any, Literal, Protocol, runtime_checkable
from pydantic import BaseModel, Field
@@ -16,6 +16,7 @@ from llama_stack.apis.inference import InterleavedContent
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
+from llama_stack.strong_typing.schema import register_schema
class Chunk(BaseModel):
@@ -133,6 +134,50 @@ class VectorStoreDeleteResponse(BaseModel):
deleted: bool = True
+@json_schema_type
+class VectorStoreChunkingStrategyAuto(BaseModel):
+ type: Literal["auto"] = "auto"
+
+
+@json_schema_type
+class VectorStoreChunkingStrategyStaticConfig(BaseModel):
+ chunk_overlap_tokens: int = 400
+ max_chunk_size_tokens: int = Field(800, ge=100, le=4096)
+
+
+@json_schema_type
+class VectorStoreChunkingStrategyStatic(BaseModel):
+ type: Literal["static"] = "static"
+ static: VectorStoreChunkingStrategyStaticConfig
+
+
+VectorStoreChunkingStrategy = Annotated[
+ VectorStoreChunkingStrategyAuto | VectorStoreChunkingStrategyStatic, Field(discriminator="type")
+]
+register_schema(VectorStoreChunkingStrategy, name="VectorStoreChunkingStrategy")
+
+
+@json_schema_type
+class VectorStoreFileLastError(BaseModel):
+ code: Literal["server_error"] | Literal["rate_limit_exceeded"]
+ message: str
+
+
+@json_schema_type
+class VectorStoreFileObject(BaseModel):
+ """OpenAI Vector Store File object."""
+
+ id: str
+ object: str = "vector_store.file"
+ attributes: dict[str, Any] = Field(default_factory=dict)
+ chunking_strategy: VectorStoreChunkingStrategy
+ created_at: int
+ last_error: VectorStoreFileLastError | None = None
+ status: Literal["completed"] | Literal["in_progress"] | Literal["cancelled"] | Literal["failed"]
+ usage_bytes: int = 0
+ vector_store_id: str
+
+
class VectorDBStore(Protocol):
def get_vector_db(self, vector_db_id: str) -> VectorDB | None: ...
@@ -290,3 +335,21 @@ class VectorIO(Protocol):
:returns: A VectorStoreSearchResponse containing the search results.
"""
...
+
+ @webmethod(route="/openai/v1/vector_stores/{vector_store_id}/files", method="POST")
+ async def openai_attach_file_to_vector_store(
+ self,
+ vector_store_id: str,
+ file_id: str,
+ attributes: dict[str, Any] | None = None,
+ chunking_strategy: VectorStoreChunkingStrategy | None = None,
+ ) -> VectorStoreFileObject:
+ """Attach a file to a vector store.
+
+ :param vector_store_id: The ID of the vector store to attach the file to.
+ :param file_id: The ID of the file to attach to the vector store.
+ :param attributes: The key-value attributes stored with the file, which can be used for filtering.
+ :param chunking_strategy: The chunking strategy to use for the file.
+ :returns: A VectorStoreFileObject representing the attached file.
+ """
+ ...
diff --git a/llama_stack/distribution/routers/inference.py b/llama_stack/distribution/routers/inference.py
index 62d04cdc4..4e0a33b59 100644
--- a/llama_stack/distribution/routers/inference.py
+++ b/llama_stack/distribution/routers/inference.py
@@ -426,6 +426,7 @@ class InferenceRouter(Inference):
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
+ suffix: str | None = None,
) -> OpenAICompletion:
logger.debug(
f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}",
@@ -456,6 +457,7 @@ class InferenceRouter(Inference):
user=user,
guided_choice=guided_choice,
prompt_logprobs=prompt_logprobs,
+ suffix=suffix,
)
provider = self.routing_table.get_provider_impl(model_obj.identifier)
diff --git a/llama_stack/distribution/routers/vector_io.py b/llama_stack/distribution/routers/vector_io.py
index 3d65aef24..8eb56b7ca 100644
--- a/llama_stack/distribution/routers/vector_io.py
+++ b/llama_stack/distribution/routers/vector_io.py
@@ -19,6 +19,7 @@ from llama_stack.apis.vector_io import (
VectorStoreObject,
VectorStoreSearchResponsePage,
)
+from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import RoutingTable
@@ -254,3 +255,20 @@ class VectorIORouter(VectorIO):
ranking_options=ranking_options,
rewrite_query=rewrite_query,
)
+
+ async def openai_attach_file_to_vector_store(
+ self,
+ vector_store_id: str,
+ file_id: str,
+ attributes: dict[str, Any] | None = None,
+ chunking_strategy: VectorStoreChunkingStrategy | None = None,
+ ) -> VectorStoreFileObject:
+ logger.debug(f"VectorIORouter.openai_attach_file_to_vector_store: {vector_store_id}, {file_id}")
+ # Route based on vector store ID
+ provider = self.routing_table.get_provider_impl(vector_store_id)
+ return await provider.openai_attach_file_to_vector_store(
+ vector_store_id=vector_store_id,
+ file_id=file_id,
+ attributes=attributes,
+ chunking_strategy=chunking_strategy,
+ )
diff --git a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
index 0ff6dc2c5..33fcbfa5d 100644
--- a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
+++ b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
@@ -24,6 +24,7 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseInputMessageContentImage,
OpenAIResponseInputMessageContentText,
OpenAIResponseInputTool,
+ OpenAIResponseInputToolFileSearch,
OpenAIResponseInputToolMCP,
OpenAIResponseMessage,
OpenAIResponseObject,
@@ -34,6 +35,7 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseOutput,
OpenAIResponseOutputMessageContent,
OpenAIResponseOutputMessageContentOutputText,
+ OpenAIResponseOutputMessageFileSearchToolCall,
OpenAIResponseOutputMessageFunctionToolCall,
OpenAIResponseOutputMessageMCPListTools,
OpenAIResponseOutputMessageWebSearchToolCall,
@@ -62,7 +64,7 @@ from llama_stack.apis.inference.inference import (
OpenAIToolMessageParam,
OpenAIUserMessageParam,
)
-from llama_stack.apis.tools.tools import ToolGroups, ToolRuntime
+from llama_stack.apis.tools import RAGQueryConfig, ToolGroups, ToolRuntime
from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
@@ -198,7 +200,8 @@ class OpenAIResponsePreviousResponseWithInputItems(BaseModel):
class ChatCompletionContext(BaseModel):
model: str
messages: list[OpenAIMessageParam]
- tools: list[ChatCompletionToolParam] | None = None
+ response_tools: list[OpenAIResponseInputTool] | None = None
+ chat_tools: list[ChatCompletionToolParam] | None = None
mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP]
temperature: float | None
response_format: OpenAIResponseFormatParam
@@ -388,7 +391,8 @@ class OpenAIResponsesImpl:
ctx = ChatCompletionContext(
model=model,
messages=messages,
- tools=chat_tools,
+ response_tools=tools,
+ chat_tools=chat_tools,
mcp_tool_to_server=mcp_tool_to_server,
temperature=temperature,
response_format=response_format,
@@ -417,7 +421,7 @@ class OpenAIResponsesImpl:
completion_result = await self.inference_api.openai_chat_completion(
model=ctx.model,
messages=messages,
- tools=ctx.tools,
+ tools=ctx.chat_tools,
stream=True,
temperature=ctx.temperature,
response_format=ctx.response_format,
@@ -606,6 +610,12 @@ class OpenAIResponsesImpl:
if not tool:
raise ValueError(f"Tool {tool_name} not found")
chat_tools.append(make_openai_tool(tool_name, tool))
+ elif input_tool.type == "file_search":
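+                # The Responses file_search tool is implemented on top of the built-in knowledge_search (RAG) tool.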
+ tool_name = "knowledge_search"
+ tool = await self.tool_groups_api.get_tool(tool_name)
+ if not tool:
+ raise ValueError(f"Tool {tool_name} not found")
+ chat_tools.append(make_openai_tool(tool_name, tool))
elif input_tool.type == "mcp":
always_allowed = None
never_allowed = None
@@ -667,6 +677,7 @@ class OpenAIResponsesImpl:
tool_call_id = tool_call.id
function = tool_call.function
         if not function or not tool_call_id or not function.name:
             return None, None
+        tool_kwargs = json.loads(function.arguments) if function.arguments else {}
@@ -680,12 +691,26 @@ class OpenAIResponsesImpl:
endpoint=mcp_tool.server_url,
headers=mcp_tool.headers or {},
tool_name=function.name,
- kwargs=json.loads(function.arguments) if function.arguments else {},
+ kwargs=tool_kwargs,
)
else:
+ if function.name == "knowledge_search":
+                response_file_search_tool = next(
+                    (t for t in ctx.response_tools or [] if isinstance(t, OpenAIResponseInputToolFileSearch)),
+                    None,
+                )
+ if response_file_search_tool:
+ if response_file_search_tool.filters:
+ logger.warning("Filters are not yet supported for file_search tool")
+ if response_file_search_tool.ranking_options:
+ logger.warning("Ranking options are not yet supported for file_search tool")
+ tool_kwargs["vector_db_ids"] = response_file_search_tool.vector_store_ids
+ tool_kwargs["query_config"] = RAGQueryConfig(
+ mode="vector",
+ max_chunks=response_file_search_tool.max_num_results,
+ )
result = await self.tool_runtime_api.invoke_tool(
tool_name=function.name,
- kwargs=json.loads(function.arguments) if function.arguments else {},
+ kwargs=tool_kwargs,
)
except Exception as e:
error_exc = e
@@ -713,6 +738,27 @@ class OpenAIResponsesImpl:
)
if error_exc or (result.error_code and result.error_code > 0) or result.error_message:
message.status = "failed"
+ elif function.name == "knowledge_search":
+ message = OpenAIResponseOutputMessageFileSearchToolCall(
+ id=tool_call_id,
+ queries=[tool_kwargs.get("query", "")],
+ status="completed",
+ )
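+                # The knowledge_search (RAG) tool reports retrieved chunks via result.metadata;
+                # map them into OpenAI-style file_search_call results (file_id/filename/text/score).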
+ if "document_ids" in result.metadata:
+ message.results = []
+ for i, doc_id in enumerate(result.metadata["document_ids"]):
+ text = result.metadata["chunks"][i] if "chunks" in result.metadata else None
+ score = result.metadata["scores"][i] if "scores" in result.metadata else None
+ message.results.append(
+ {
+ "file_id": doc_id,
+ "filename": doc_id,
+ "text": text,
+ "score": score,
+ }
+ )
+ if error_exc or (result.error_code and result.error_code > 0) or result.error_message:
+ message.status = "failed"
else:
raise ValueError(f"Unknown tool {function.name} called")
diff --git a/llama_stack/providers/inline/tool_runtime/rag/memory.py b/llama_stack/providers/inline/tool_runtime/rag/memory.py
index 4776d47d0..7f4fe5dbd 100644
--- a/llama_stack/providers/inline/tool_runtime/rag/memory.py
+++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py
@@ -121,8 +121,10 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
vector_db_id=vector_db_id,
query=query,
params={
- "max_chunks": query_config.max_chunks,
"mode": query_config.mode,
+ "max_chunks": query_config.max_chunks,
+ "score_threshold": 0.0,
+ "ranker": query_config.ranker,
},
)
for vector_db_id in vector_db_ids
@@ -170,6 +172,8 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
content=picked,
metadata={
"document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]],
+ "chunks": [c.content for c in chunks[: len(picked)]],
+ "scores": scores[: len(picked)],
},
)
diff --git a/llama_stack/providers/inline/vector_io/faiss/__init__.py b/llama_stack/providers/inline/vector_io/faiss/__init__.py
index 68a1dee66..dd1c59b7b 100644
--- a/llama_stack/providers/inline/vector_io/faiss/__init__.py
+++ b/llama_stack/providers/inline/vector_io/faiss/__init__.py
@@ -16,6 +16,6 @@ async def get_provider_impl(config: FaissVectorIOConfig, deps: dict[Api, Any]):
assert isinstance(config, FaissVectorIOConfig), f"Unexpected config type: {type(config)}"
- impl = FaissVectorIOAdapter(config, deps[Api.inference])
+ impl = FaissVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files, None))
await impl.initialize()
return impl
diff --git a/llama_stack/providers/inline/vector_io/faiss/faiss.py b/llama_stack/providers/inline/vector_io/faiss/faiss.py
index 5e9155011..a2f4417e0 100644
--- a/llama_stack/providers/inline/vector_io/faiss/faiss.py
+++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py
@@ -15,6 +15,7 @@ import faiss
import numpy as np
from numpy.typing import NDArray
+from llama_stack.apis.files import Files
from llama_stack.apis.inference import InterleavedContent
from llama_stack.apis.inference.inference import Inference
from llama_stack.apis.vector_dbs import VectorDB
@@ -130,11 +131,23 @@ class FaissIndex(EmbeddingIndex):
) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in FAISS")
+ async def query_hybrid(
+ self,
+ embedding: NDArray,
+ query_string: str,
+ k: int,
+ score_threshold: float,
+ reranker_type: str,
+ reranker_params: dict[str, Any] | None = None,
+ ) -> QueryChunksResponse:
+ raise NotImplementedError("Hybrid search is not supported in FAISS")
+
class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
- def __init__(self, config: FaissVectorIOConfig, inference_api: Inference) -> None:
+ def __init__(self, config: FaissVectorIOConfig, inference_api: Inference, files_api: Files | None) -> None:
self.config = config
self.inference_api = inference_api
+ self.files_api = files_api
self.cache: dict[str, VectorDBWithIndex] = {}
self.kvstore: KVStore | None = None
self.openai_vector_stores: dict[str, dict[str, Any]] = {}
diff --git a/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py b/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py
index 6db176eda..e5200a755 100644
--- a/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py
+++ b/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py
@@ -15,6 +15,6 @@ async def get_provider_impl(config: SQLiteVectorIOConfig, deps: dict[Api, Any]):
from .sqlite_vec import SQLiteVecVectorIOAdapter
assert isinstance(config, SQLiteVectorIOConfig), f"Unexpected config type: {type(config)}"
- impl = SQLiteVecVectorIOAdapter(config, deps[Api.inference])
+ impl = SQLiteVecVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files, None))
await impl.initialize()
return impl
diff --git a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
index 02f04e766..c6712882a 100644
--- a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
+++ b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
@@ -17,6 +17,7 @@ import numpy as np
import sqlite_vec
from numpy.typing import NDArray
+from llama_stack.apis.files.files import Files
from llama_stack.apis.inference.inference import Inference
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import (
@@ -26,14 +27,20 @@ from llama_stack.apis.vector_io import (
)
from llama_stack.providers.datatypes import VectorDBsProtocolPrivate
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
-from llama_stack.providers.utils.memory.vector_store import EmbeddingIndex, VectorDBWithIndex
+from llama_stack.providers.utils.memory.vector_store import (
+ RERANKER_TYPE_RRF,
+ RERANKER_TYPE_WEIGHTED,
+ EmbeddingIndex,
+ VectorDBWithIndex,
+)
logger = logging.getLogger(__name__)
# Specifying search mode is dependent on the VectorIO provider.
VECTOR_SEARCH = "vector"
KEYWORD_SEARCH = "keyword"
-SEARCH_MODES = {VECTOR_SEARCH, KEYWORD_SEARCH}
+HYBRID_SEARCH = "hybrid"
+SEARCH_MODES = {VECTOR_SEARCH, KEYWORD_SEARCH, HYBRID_SEARCH}
def serialize_vector(vector: list[float]) -> bytes:
@@ -50,6 +57,59 @@ def _create_sqlite_connection(db_path):
return connection
+def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
+ """Normalize scores to [0,1] range using min-max normalization."""
+ if not scores:
+ return {}
+ min_score = min(scores.values())
+ max_score = max(scores.values())
+ score_range = max_score - min_score
+ if score_range > 0:
+ return {doc_id: (score - min_score) / score_range for doc_id, score in scores.items()}
+ return {doc_id: 1.0 for doc_id in scores}
+
+
+def _weighted_rerank(
+ vector_scores: dict[str, float],
+ keyword_scores: dict[str, float],
+ alpha: float = 0.5,
+) -> dict[str, float]:
+ """ReRanker that uses weighted average of scores."""
+ all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
+ normalized_vector_scores = _normalize_scores(vector_scores)
+ normalized_keyword_scores = _normalize_scores(keyword_scores)
+
+    # alpha weights the vector score and (1 - alpha) the keyword score, matching the
+    # WeightedRanker docs: alpha=0 -> keyword only, alpha=1 -> vector only.
+    return {
+        doc_id: (alpha * normalized_vector_scores.get(doc_id, 0.0))
+        + ((1 - alpha) * normalized_keyword_scores.get(doc_id, 0.0))
+ for doc_id in all_ids
+ }
+
+
+def _rrf_rerank(
+ vector_scores: dict[str, float],
+ keyword_scores: dict[str, float],
+ impact_factor: float = 60.0,
+) -> dict[str, float]:
+ """ReRanker that uses Reciprocal Rank Fusion."""
+ # Convert scores to ranks
+ vector_ranks = {
+ doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(vector_scores.items(), key=lambda x: x[1], reverse=True))
+ }
+ keyword_ranks = {
+ doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True))
+ }
+
+ all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
+ rrf_scores = {}
+ for doc_id in all_ids:
+ vector_rank = vector_ranks.get(doc_id, float("inf"))
+ keyword_rank = keyword_ranks.get(doc_id, float("inf"))
+ # RRF formula: score = 1/(k + r) where k is impact_factor and r is the rank
+ rrf_scores[doc_id] = (1.0 / (impact_factor + vector_rank)) + (1.0 / (impact_factor + keyword_rank))
+ return rrf_scores
+
+
class SQLiteVecIndex(EmbeddingIndex):
"""
An index implementation that stores embeddings in a SQLite virtual table using sqlite-vec.
@@ -254,8 +314,6 @@ class SQLiteVecIndex(EmbeddingIndex):
"""
Performs keyword-based search using SQLite FTS5 for relevance-ranked full-text search.
"""
- if query_string is None:
- raise ValueError("query_string is required for keyword search.")
def _execute_query():
connection = _create_sqlite_connection(self.db_path)
@@ -293,6 +351,81 @@ class SQLiteVecIndex(EmbeddingIndex):
scores.append(score)
return QueryChunksResponse(chunks=chunks, scores=scores)
+ async def query_hybrid(
+ self,
+ embedding: NDArray,
+ query_string: str,
+ k: int,
+ score_threshold: float,
+ reranker_type: str = RERANKER_TYPE_RRF,
+ reranker_params: dict[str, Any] | None = None,
+ ) -> QueryChunksResponse:
+ """
+ Hybrid search using a configurable re-ranking strategy.
+
+ Args:
+ embedding: The query embedding vector
+ query_string: The text query for keyword search
+ k: Number of results to return
+ score_threshold: Minimum similarity score threshold
+ reranker_type: Type of reranker to use ("rrf" or "weighted")
+ reranker_params: Parameters for the reranker
+
+ Returns:
+ QueryChunksResponse with combined results
+ """
+ if reranker_params is None:
+ reranker_params = {}
+
+ # Get results from both search methods
+ vector_response = await self.query_vector(embedding, k, score_threshold)
+ keyword_response = await self.query_keyword(query_string, k, score_threshold)
+
+ # Convert responses to score dictionaries using generate_chunk_id
+ vector_scores = {
+ generate_chunk_id(chunk.metadata["document_id"], str(chunk.content)): score
+ for chunk, score in zip(vector_response.chunks, vector_response.scores, strict=False)
+ }
+ keyword_scores = {
+ generate_chunk_id(chunk.metadata["document_id"], str(chunk.content)): score
+ for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
+ }
+
+ # Combine scores using the specified reranker
+ if reranker_type == RERANKER_TYPE_WEIGHTED:
+ alpha = reranker_params.get("alpha", 0.5)
+ combined_scores = _weighted_rerank(vector_scores, keyword_scores, alpha)
+ else:
+ # Default to RRF for None, RRF, or any unknown types
+ impact_factor = reranker_params.get("impact_factor", 60.0)
+ combined_scores = _rrf_rerank(vector_scores, keyword_scores, impact_factor)
+
+ # Sort by combined score and get top k results
+ sorted_items = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
+ top_k_items = sorted_items[:k]
+
+ # Filter by score threshold
+ filtered_items = [(doc_id, score) for doc_id, score in top_k_items if score >= score_threshold]
+
+ # Create a map of chunk_id to chunk for both responses
+ chunk_map = {}
+ for c in vector_response.chunks:
+ chunk_id = generate_chunk_id(c.metadata["document_id"], str(c.content))
+ chunk_map[chunk_id] = c
+ for c in keyword_response.chunks:
+ chunk_id = generate_chunk_id(c.metadata["document_id"], str(c.content))
+ chunk_map[chunk_id] = c
+
+ # Use the map to look up chunks by their IDs
+ chunks = []
+ scores = []
+ for doc_id, score in filtered_items:
+ if doc_id in chunk_map:
+ chunks.append(chunk_map[doc_id])
+ scores.append(score)
+
+ return QueryChunksResponse(chunks=chunks, scores=scores)
+
class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
"""
@@ -301,9 +434,10 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
and creates a cache of VectorDBWithIndex instances (each wrapping a SQLiteVecIndex).
"""
- def __init__(self, config, inference_api: Inference) -> None:
+ def __init__(self, config, inference_api: Inference, files_api: Files | None) -> None:
self.config = config
self.inference_api = inference_api
+ self.files_api = files_api
self.cache: dict[str, VectorDBWithIndex] = {}
self.openai_vector_stores: dict[str, dict[str, Any]] = {}
@@ -343,7 +477,9 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
vector_db_data = row[0]
vector_db = VectorDB.model_validate_json(vector_db_data)
index = await SQLiteVecIndex.create(
- vector_db.embedding_dimension, self.config.db_path, vector_db.identifier
+ vector_db.embedding_dimension,
+ self.config.db_path,
+ vector_db.identifier,
)
self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)
@@ -369,7 +505,11 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
connection.close()
await asyncio.to_thread(_register_db)
- index = await SQLiteVecIndex.create(vector_db.embedding_dimension, self.config.db_path, vector_db.identifier)
+ index = await SQLiteVecIndex.create(
+ vector_db.embedding_dimension,
+ self.config.db_path,
+ vector_db.identifier,
+ )
self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)
async def list_vector_dbs(self) -> list[VectorDB]:
diff --git a/llama_stack/providers/registry/vector_io.py b/llama_stack/providers/registry/vector_io.py
index d888c8420..55c1b5617 100644
--- a/llama_stack/providers/registry/vector_io.py
+++ b/llama_stack/providers/registry/vector_io.py
@@ -24,6 +24,7 @@ def available_providers() -> list[ProviderSpec]:
config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig",
deprecation_warning="Please use the `inline::faiss` provider instead.",
api_dependencies=[Api.inference],
+ optional_api_dependencies=[Api.files],
),
InlineProviderSpec(
api=Api.vector_io,
@@ -32,6 +33,7 @@ def available_providers() -> list[ProviderSpec]:
module="llama_stack.providers.inline.vector_io.faiss",
config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig",
api_dependencies=[Api.inference],
+ optional_api_dependencies=[Api.files],
),
# NOTE: sqlite-vec cannot be bundled into the container image because it does not have a
# source distribution and the wheels are not available for all platforms.
@@ -42,6 +44,7 @@ def available_providers() -> list[ProviderSpec]:
module="llama_stack.providers.inline.vector_io.sqlite_vec",
config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig",
api_dependencies=[Api.inference],
+ optional_api_dependencies=[Api.files],
),
InlineProviderSpec(
api=Api.vector_io,
@@ -51,6 +54,7 @@ def available_providers() -> list[ProviderSpec]:
config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig",
deprecation_warning="Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.",
api_dependencies=[Api.inference],
+ optional_api_dependencies=[Api.files],
),
remote_provider_spec(
Api.vector_io,
diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py
index 75a9e33e2..79b1b5f08 100644
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@@ -318,6 +318,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
+ suffix: str | None = None,
) -> OpenAICompletion:
model_obj = await self.model_store.get_model(model)
diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py
index 4c68322e0..cb6c6e279 100644
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -316,6 +316,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
+ suffix: str | None = None,
) -> OpenAICompletion:
provider_model_id = await self._get_provider_model_id(model)
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index 358a29d4c..d51072fbf 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -33,7 +33,6 @@ from llama_stack.apis.inference import (
JsonSchemaResponseFormat,
LogProbConfig,
Message,
- OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
@@ -46,6 +45,8 @@ from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAICompletion,
+ OpenAIEmbeddingsResponse,
+ OpenAIEmbeddingUsage,
OpenAIMessageParam,
OpenAIResponseFormatParam,
)
@@ -62,8 +63,10 @@ from llama_stack.providers.utils.inference.model_registry import (
from llama_stack.providers.utils.inference.openai_compat import (
OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse,
+ b64_encode_openai_embeddings_response,
get_sampling_options,
prepare_openai_completion_params,
+ prepare_openai_embeddings_params,
process_chat_completion_response,
process_chat_completion_stream_response,
process_completion_response,
@@ -386,7 +389,35 @@ class OllamaInferenceAdapter(
dimensions: int | None = None,
user: str | None = None,
) -> OpenAIEmbeddingsResponse:
- raise NotImplementedError()
+ model_obj = await self._get_model(model)
+ if model_obj.model_type != ModelType.embedding:
+ raise ValueError(f"Model {model} is not an embedding model")
+
+ if model_obj.provider_resource_id is None:
+ raise ValueError(f"Model {model} has no provider_resource_id set")
+
+ # Note: Ollama does not currently support the encoding_format, dimensions, or user parameters
+ params = prepare_openai_embeddings_params(
+ model=model_obj.provider_resource_id,
+ input=input,
+ encoding_format=encoding_format,
+ dimensions=dimensions,
+ user=user,
+ )
+
+ response = await self.openai_client.embeddings.create(**params)
+ data = b64_encode_openai_embeddings_response(response.data, encoding_format)
+
+ usage = OpenAIEmbeddingUsage(
+ prompt_tokens=response.usage.prompt_tokens,
+ total_tokens=response.usage.total_tokens,
+ )
+ # TODO: Investigate why model_obj.identifier is used instead of response.model
+ return OpenAIEmbeddingsResponse(
+ data=data,
+ model=model_obj.identifier,
+ usage=usage,
+ )
async def openai_completion(
self,
@@ -409,6 +440,7 @@ class OllamaInferenceAdapter(
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
+ suffix: str | None = None,
) -> OpenAICompletion:
if not isinstance(prompt, str):
raise ValueError("Ollama does not support non-string prompts for completion")
@@ -432,6 +464,7 @@ class OllamaInferenceAdapter(
temperature=temperature,
top_p=top_p,
user=user,
+ suffix=suffix,
)
return await self.openai_client.completions.create(**params) # type: ignore
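
Since the Ollama adapter now routes embeddings through the shared OpenAI-compat helpers, a request with encoding_format="base64" returns each embedding as a base64 string of packed 32-bit floats rather than a list of numbers. A minimal client-side sketch for turning such a string back into floats, assuming only that it was produced with struct.pack("f", ...) as in the b64_encode_openai_embeddings_response helper added later in this patch:

import base64
import struct

def decode_base64_embedding(encoded: str) -> list[float]:
    """Decode a base64 embedding produced by packing 32-bit floats."""
    raw = base64.b64decode(encoded)
    return list(struct.unpack(f"{len(raw) // 4}f", raw))

# Round-trip three values; float32 packing limits the precision
packed = base64.b64encode(struct.pack("3f", 0.1, 0.2, 0.3)).decode("utf-8")
print(decode_base64_embedding(packed))  # ~[0.1, 0.2, 0.3]
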
diff --git a/llama_stack/providers/remote/inference/openai/openai.py b/llama_stack/providers/remote/inference/openai/openai.py
index 6f3a686a8..ed4ec22aa 100644
--- a/llama_stack/providers/remote/inference/openai/openai.py
+++ b/llama_stack/providers/remote/inference/openai/openai.py
@@ -90,6 +90,7 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
+ suffix: str | None = None,
) -> OpenAICompletion:
if guided_choice is not None:
logging.warning("guided_choice is not supported by the OpenAI API. Ignoring.")
@@ -117,6 +118,7 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
temperature=temperature,
top_p=top_p,
user=user,
+ suffix=suffix,
)
return await self._openai_client.completions.create(**params)
diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py
index 6cf4680e2..e9660abb9 100644
--- a/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py
@@ -242,6 +242,7 @@ class PassthroughInferenceAdapter(Inference):
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
+ suffix: str | None = None,
) -> OpenAICompletion:
client = self._get_client()
model_obj = await self.model_store.get_model(model)
diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py
index 7305a638d..7030a644d 100644
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -299,6 +299,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
+ suffix: str | None = None,
) -> OpenAICompletion:
model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index d0a822f3c..16d133c81 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -559,6 +559,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
+ suffix: str | None = None,
) -> OpenAICompletion:
self._lazy_initialize_client()
model_obj = await self._get_model(model)
diff --git a/llama_stack/providers/remote/inference/watsonx/watsonx.py b/llama_stack/providers/remote/inference/watsonx/watsonx.py
index 3ec5fce66..eedeb7baf 100644
--- a/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py
@@ -313,6 +313,7 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
+ suffix: str | None = None,
) -> OpenAICompletion:
model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(
diff --git a/llama_stack/providers/remote/vector_io/chroma/chroma.py b/llama_stack/providers/remote/vector_io/chroma/chroma.py
index 0d8451eb2..027cdcb11 100644
--- a/llama_stack/providers/remote/vector_io/chroma/chroma.py
+++ b/llama_stack/providers/remote/vector_io/chroma/chroma.py
@@ -23,6 +23,7 @@ from llama_stack.apis.vector_io import (
VectorStoreObject,
VectorStoreSearchResponsePage,
)
+from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
from llama_stack.providers.inline.vector_io.chroma import ChromaVectorIOConfig as InlineChromaVectorIOConfig
from llama_stack.providers.utils.memory.vector_store import (
@@ -104,6 +105,17 @@ class ChromaIndex(EmbeddingIndex):
) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in Chroma")
+ async def query_hybrid(
+ self,
+ embedding: NDArray,
+ query_string: str,
+ k: int,
+ score_threshold: float,
+ reranker_type: str,
+ reranker_params: dict[str, Any] | None = None,
+ ) -> QueryChunksResponse:
+ raise NotImplementedError("Hybrid search is not supported in Chroma")
+
class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
def __init__(
@@ -241,3 +253,12 @@ class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
rewrite_query: bool | None = False,
) -> VectorStoreSearchResponsePage:
raise NotImplementedError("OpenAI Vector Stores API is not supported in Chroma")
+
+ async def openai_attach_file_to_vector_store(
+ self,
+ vector_store_id: str,
+ file_id: str,
+ attributes: dict[str, Any] | None = None,
+ chunking_strategy: VectorStoreChunkingStrategy | None = None,
+ ) -> VectorStoreFileObject:
+ raise NotImplementedError("OpenAI Vector Stores API is not supported in Chroma")
diff --git a/llama_stack/providers/remote/vector_io/milvus/milvus.py b/llama_stack/providers/remote/vector_io/milvus/milvus.py
index 8ae74aedc..42ab4fa3e 100644
--- a/llama_stack/providers/remote/vector_io/milvus/milvus.py
+++ b/llama_stack/providers/remote/vector_io/milvus/milvus.py
@@ -25,6 +25,7 @@ from llama_stack.apis.vector_io import (
VectorStoreObject,
VectorStoreSearchResponsePage,
)
+from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
from llama_stack.providers.inline.vector_io.milvus import MilvusVectorIOConfig as InlineMilvusVectorIOConfig
from llama_stack.providers.utils.memory.vector_store import (
@@ -102,6 +103,17 @@ class MilvusIndex(EmbeddingIndex):
) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in Milvus")
+ async def query_hybrid(
+ self,
+ embedding: NDArray,
+ query_string: str,
+ k: int,
+ score_threshold: float,
+ reranker_type: str,
+ reranker_params: dict[str, Any] | None = None,
+ ) -> QueryChunksResponse:
+ raise NotImplementedError("Hybrid search is not supported in Milvus")
+
class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
def __init__(
@@ -240,6 +252,15 @@ class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
) -> VectorStoreSearchResponsePage:
raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")
+ async def openai_attach_file_to_vector_store(
+ self,
+ vector_store_id: str,
+ file_id: str,
+ attributes: dict[str, Any] | None = None,
+ chunking_strategy: VectorStoreChunkingStrategy | None = None,
+ ) -> VectorStoreFileObject:
+ raise NotImplementedError("OpenAI Vector Stores API is not supported in Milvus")
+
def generate_chunk_id(document_id: str, chunk_text: str) -> str:
"""Generate a unique chunk ID using a hash of document ID and chunk text."""
diff --git a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
index 7d58a49f3..1917af086 100644
--- a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
+++ b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
@@ -128,6 +128,17 @@ class PGVectorIndex(EmbeddingIndex):
) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in PGVector")
+ async def query_hybrid(
+ self,
+ embedding: NDArray,
+ query_string: str,
+ k: int,
+ score_threshold: float,
+ reranker_type: str,
+ reranker_params: dict[str, Any] | None = None,
+ ) -> QueryChunksResponse:
+ raise NotImplementedError("Hybrid search is not supported in PGVector")
+
async def delete(self):
with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(f"DROP TABLE IF EXISTS {self.table_name}")
diff --git a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
index 10f3b5b0d..fa7782f04 100644
--- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
+++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
@@ -23,6 +23,7 @@ from llama_stack.apis.vector_io import (
VectorStoreObject,
VectorStoreSearchResponsePage,
)
+from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
from llama_stack.providers.inline.vector_io.qdrant import QdrantVectorIOConfig as InlineQdrantVectorIOConfig
from llama_stack.providers.utils.memory.vector_store import (
@@ -111,6 +112,17 @@ class QdrantIndex(EmbeddingIndex):
) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in Qdrant")
+ async def query_hybrid(
+ self,
+ embedding: NDArray,
+ query_string: str,
+ k: int,
+ score_threshold: float,
+ reranker_type: str,
+ reranker_params: dict[str, Any] | None = None,
+ ) -> QueryChunksResponse:
+ raise NotImplementedError("Hybrid search is not supported in Qdrant")
+
async def delete(self):
await self.client.delete_collection(collection_name=self.collection_name)
@@ -241,3 +253,12 @@ class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
rewrite_query: bool | None = False,
) -> VectorStoreSearchResponsePage:
raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")
+
+ async def openai_attach_file_to_vector_store(
+ self,
+ vector_store_id: str,
+ file_id: str,
+ attributes: dict[str, Any] | None = None,
+ chunking_strategy: VectorStoreChunkingStrategy | None = None,
+ ) -> VectorStoreFileObject:
+ raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")
diff --git a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
index 6f2027dad..c63dd70c6 100644
--- a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
+++ b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
@@ -92,6 +92,17 @@ class WeaviateIndex(EmbeddingIndex):
) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in Weaviate")
+ async def query_hybrid(
+ self,
+ embedding: NDArray,
+ query_string: str,
+ k: int,
+ score_threshold: float,
+ reranker_type: str,
+ reranker_params: dict[str, Any] | None = None,
+ ) -> QueryChunksResponse:
+ raise NotImplementedError("Hybrid search is not supported in Weaviate")
+
class WeaviateVectorIOAdapter(
VectorIO,
diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
index dab10bc55..c21f379c9 100644
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@@ -4,8 +4,6 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-import base64
-import struct
from collections.abc import AsyncGenerator, AsyncIterator
from typing import Any
@@ -37,7 +35,6 @@ from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAICompletion,
- OpenAIEmbeddingData,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
OpenAIMessageParam,
@@ -48,6 +45,7 @@ from llama_stack.distribution.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import (
+ b64_encode_openai_embeddings_response,
convert_message_to_openai_dict_new,
convert_openai_chat_completion_choice,
convert_openai_chat_completion_stream,
@@ -293,16 +291,7 @@ class LiteLLMOpenAIMixin(
)
# Convert response to OpenAI format
- data = []
- for i, embedding_data in enumerate(response["data"]):
- # we encode to base64 if the encoding format is base64 in the request
- if encoding_format == "base64":
- byte_data = b"".join(struct.pack("f", f) for f in embedding_data["embedding"])
- embedding = base64.b64encode(byte_data).decode("utf-8")
- else:
- embedding = embedding_data["embedding"]
-
- data.append(OpenAIEmbeddingData(embedding=embedding, index=i))
+ data = b64_encode_openai_embeddings_response(response.data, encoding_format)
usage = OpenAIEmbeddingUsage(
prompt_tokens=response["usage"]["prompt_tokens"],
@@ -336,6 +325,7 @@ class LiteLLMOpenAIMixin(
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
+ suffix: str | None = None,
) -> OpenAICompletion:
model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(
diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py
index 049f06fdb..ff95b12a7 100644
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@@ -3,8 +3,10 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
+import base64
import json
import logging
+import struct
import time
import uuid
import warnings
@@ -108,6 +110,7 @@ from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
OpenAICompletion,
OpenAICompletionChoice,
+ OpenAIEmbeddingData,
OpenAIMessageParam,
OpenAIResponseFormatParam,
ToolConfig,
@@ -1287,6 +1290,7 @@ class OpenAICompletionToLlamaStackMixin:
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
+ suffix: str | None = None,
) -> OpenAICompletion:
if stream:
raise ValueError(f"{self.__class__.__name__} doesn't support streaming openai completions")
@@ -1483,3 +1487,55 @@ class OpenAIChatCompletionToLlamaStackMixin:
model=model,
object="chat.completion",
)
+
+
+def prepare_openai_embeddings_params(
+ model: str,
+ input: str | list[str],
+ encoding_format: str | None = "float",
+ dimensions: int | None = None,
+ user: str | None = None,
+):
+ if model is None:
+ raise ValueError("Model must be provided for embeddings")
+
+ input_list = [input] if isinstance(input, str) else input
+
+ params: dict[str, Any] = {
+ "model": model,
+ "input": input_list,
+ }
+
+ if encoding_format is not None:
+ params["encoding_format"] = encoding_format
+ if dimensions is not None:
+ params["dimensions"] = dimensions
+ if user is not None:
+ params["user"] = user
+
+ return params
+
+
+def b64_encode_openai_embeddings_response(
+ response_data: list, encoding_format: str | None = "float"
+) -> list[OpenAIEmbeddingData]:
+ """
+ Process the OpenAI embeddings response to encode the embeddings in base64 format if specified.
+ """
+ data = []
+ for i, embedding_data in enumerate(response_data):
+ if encoding_format == "base64":
+ byte_array = bytearray()
+ for embedding_value in embedding_data.embedding:
+ byte_array.extend(struct.pack("f", float(embedding_value)))
+
+ response_embedding = base64.b64encode(byte_array).decode("utf-8")
+ else:
+ response_embedding = embedding_data.embedding
+ data.append(
+ OpenAIEmbeddingData(
+ embedding=response_embedding,
+ index=i,
+ )
+ )
+ return data
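
A quick sketch of how the two new helpers compose. The model name is a placeholder, and SimpleNamespace stands in for whatever SDK object a provider returns, assuming only that it exposes an .embedding attribute:

from types import SimpleNamespace

from llama_stack.providers.utils.inference.openai_compat import (
    b64_encode_openai_embeddings_response,
    prepare_openai_embeddings_params,
)

# Only non-None parameters are forwarded to the provider client
params = prepare_openai_embeddings_params(model="nomic-embed-text", input="hello world")
print(params)  # {'model': 'nomic-embed-text', 'input': ['hello world'], 'encoding_format': 'float'}

# Stand-in for the provider response's `data` list
fake_data = [SimpleNamespace(embedding=[0.1, 0.2, 0.3])]
encoded = b64_encode_openai_embeddings_response(fake_data, encoding_format="base64")
print(encoded[0].index, type(encoded[0].embedding))  # 0 <class 'str'>
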
diff --git a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
index 7d8163ed2..f9701897a 100644
--- a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
+++ b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
@@ -5,11 +5,13 @@
# the root directory of this source tree.
import logging
+import mimetypes
import time
import uuid
from abc import ABC, abstractmethod
from typing import Any
+from llama_stack.apis.files import Files
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import (
QueryChunksResponse,
@@ -20,6 +22,15 @@ from llama_stack.apis.vector_io import (
VectorStoreSearchResponse,
VectorStoreSearchResponsePage,
)
+from llama_stack.apis.vector_io.vector_io import (
+ Chunk,
+ VectorStoreChunkingStrategy,
+ VectorStoreChunkingStrategyAuto,
+ VectorStoreChunkingStrategyStatic,
+ VectorStoreFileLastError,
+ VectorStoreFileObject,
+)
+from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, make_overlapped_chunks
logger = logging.getLogger(__name__)
@@ -36,6 +47,7 @@ class OpenAIVectorStoreMixin(ABC):
# These should be provided by the implementing class
openai_vector_stores: dict[str, dict[str, Any]]
+ files_api: Files | None
@abstractmethod
async def _save_openai_vector_store(self, store_id: str, store_info: dict[str, Any]) -> None:
@@ -67,6 +79,16 @@ class OpenAIVectorStoreMixin(ABC):
"""Unregister a vector database (provider-specific implementation)."""
pass
+ @abstractmethod
+ async def insert_chunks(
+ self,
+ vector_db_id: str,
+ chunks: list[Chunk],
+ ttl_seconds: int | None = None,
+ ) -> None:
+ """Insert chunks into a vector database (provider-specific implementation)."""
+ pass
+
@abstractmethod
async def query_chunks(
self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
@@ -383,3 +405,78 @@ class OpenAIVectorStoreMixin(ABC):
if metadata[key] != value:
return False
return True
+
+ async def openai_attach_file_to_vector_store(
+ self,
+ vector_store_id: str,
+ file_id: str,
+ attributes: dict[str, Any] | None = None,
+ chunking_strategy: VectorStoreChunkingStrategy | None = None,
+ ) -> VectorStoreFileObject:
+ attributes = attributes or {}
+ chunking_strategy = chunking_strategy or VectorStoreChunkingStrategyAuto()
+
+ vector_store_file_object = VectorStoreFileObject(
+ id=file_id,
+ attributes=attributes,
+ chunking_strategy=chunking_strategy,
+ created_at=int(time.time()),
+ status="in_progress",
+ vector_store_id=vector_store_id,
+ )
+
+ if not hasattr(self, "files_api") or not self.files_api:
+ vector_store_file_object.status = "failed"
+ vector_store_file_object.last_error = VectorStoreFileLastError(
+ code="server_error",
+ message="Files API is not available",
+ )
+ return vector_store_file_object
+
+ if isinstance(chunking_strategy, VectorStoreChunkingStrategyStatic):
+ max_chunk_size_tokens = chunking_strategy.static.max_chunk_size_tokens
+ chunk_overlap_tokens = chunking_strategy.static.chunk_overlap_tokens
+ else:
+ # Default values from OpenAI API spec
+ max_chunk_size_tokens = 800
+ chunk_overlap_tokens = 400
+
+ try:
+ file_response = await self.files_api.openai_retrieve_file(file_id)
+ mime_type, _ = mimetypes.guess_type(file_response.filename)
+ content_response = await self.files_api.openai_retrieve_file_content(file_id)
+
+ content = content_from_data_and_mime_type(content_response.body, mime_type)
+
+ chunks = make_overlapped_chunks(
+ file_id,
+ content,
+ max_chunk_size_tokens,
+ chunk_overlap_tokens,
+ attributes,
+ )
+
+ if not chunks:
+ vector_store_file_object.status = "failed"
+ vector_store_file_object.last_error = VectorStoreFileLastError(
+ code="server_error",
+ message="No chunks were generated from the file",
+ )
+ return vector_store_file_object
+
+ await self.insert_chunks(
+ vector_db_id=vector_store_id,
+ chunks=chunks,
+ )
+ except Exception as e:
+ logger.error(f"Error attaching file to vector store: {e}")
+ vector_store_file_object.status = "failed"
+ vector_store_file_object.last_error = VectorStoreFileLastError(
+ code="server_error",
+ message=str(e),
+ )
+ return vector_store_file_object
+
+ vector_store_file_object.status = "completed"
+
+ return vector_store_file_object
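
When the caller supplies no static chunking strategy, the mixin falls back to the OpenAI defaults of 800-token chunks with a 400-token overlap. The following is only an illustration of that windowing arithmetic, not the actual make_overlapped_chunks implementation:

def overlapped_windows(tokens: list[str], size: int = 800, overlap: int = 400) -> list[list[str]]:
    """Split tokens into windows of `size`, each starting `size - overlap` tokens after the previous one."""
    step = size - overlap
    return [tokens[i : i + size] for i in range(0, max(len(tokens) - overlap, 1), step)]

# 1,000 tokens -> two windows: [0, 800) and [400, 1000), sharing a 400-token overlap
windows = overlapped_windows([f"tok{i}" for i in range(1000)])
print(len(windows), len(windows[0]), len(windows[1]))  # 2 800 600
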
diff --git a/llama_stack/providers/utils/memory/vector_store.py b/llama_stack/providers/utils/memory/vector_store.py
index 4cd15860b..a6e420feb 100644
--- a/llama_stack/providers/utils/memory/vector_store.py
+++ b/llama_stack/providers/utils/memory/vector_store.py
@@ -32,6 +32,10 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
log = logging.getLogger(__name__)
+# Constants for reranker types
+RERANKER_TYPE_RRF = "rrf"
+RERANKER_TYPE_WEIGHTED = "weighted"
+
def parse_pdf(data: bytes) -> str:
# For PDF and DOC/DOCX files, we can't reliably convert to string
@@ -72,16 +76,18 @@ def content_from_data(data_url: str) -> str:
data = unquote(data)
encoding = parts["encoding"] or "utf-8"
data = data.encode(encoding)
+ return content_from_data_and_mime_type(data, parts["mimetype"], parts.get("encoding", None))
- encoding = parts["encoding"]
- if not encoding:
- import chardet
- detected = chardet.detect(data)
- encoding = detected["encoding"]
+def content_from_data_and_mime_type(data: bytes | str, mime_type: str | None, encoding: str | None = None) -> str:
+ if isinstance(data, bytes):
+ if not encoding:
+ import chardet
- mime_type = parts["mimetype"]
- mime_category = mime_type.split("/")[0]
+ detected = chardet.detect(data)
+ encoding = detected["encoding"]
+
+ mime_category = mime_type.split("/")[0] if mime_type else None
if mime_category == "text":
# For text-based files (including CSV, MD)
return data.decode(encoding)
@@ -200,6 +206,18 @@ class EmbeddingIndex(ABC):
async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
raise NotImplementedError()
+ @abstractmethod
+ async def query_hybrid(
+ self,
+ embedding: NDArray,
+ query_string: str,
+ k: int,
+ score_threshold: float,
+ reranker_type: str,
+ reranker_params: dict[str, Any] | None = None,
+ ) -> QueryChunksResponse:
+ raise NotImplementedError()
+
@abstractmethod
async def delete(self):
raise NotImplementedError()
@@ -243,10 +261,29 @@ class VectorDBWithIndex:
k = params.get("max_chunks", 3)
mode = params.get("mode")
score_threshold = params.get("score_threshold", 0.0)
+
+ # Get ranker configuration
+ ranker = params.get("ranker")
+ if ranker is None:
+ # Default to RRF with impact_factor=60.0
+ reranker_type = RERANKER_TYPE_RRF
+ reranker_params = {"impact_factor": 60.0}
+ else:
+ reranker_type = ranker.type
+ reranker_params = (
+ {"impact_factor": ranker.impact_factor} if ranker.type == RERANKER_TYPE_RRF else {"alpha": ranker.alpha}
+ )
+
query_string = interleaved_content_as_str(query)
if mode == "keyword":
return await self.index.query_keyword(query_string, k, score_threshold)
+
+ # Calculate embeddings for both vector and hybrid modes
+ embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string])
+ query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
+ if mode == "hybrid":
+ return await self.index.query_hybrid(
+ query_vector, query_string, k, score_threshold, reranker_type, reranker_params
+ )
else:
- embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string])
- query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
return await self.index.query_vector(query_vector, k, score_threshold)
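
VectorDBWithIndex only resolves the ranker configuration; the actual score fusion happens inside each provider's query_hybrid (sqlite-vec in this patch). A sketch of the two fusion formulas that the unit tests below assert against, assuming 1-based ranks and normalized scores:

def rrf_score(ranks: list[int], impact_factor: float = 60.0) -> float:
    """Reciprocal Rank Fusion: sum 1 / (impact_factor + rank) over each result list the chunk appears in."""
    return sum(1.0 / (impact_factor + rank) for rank in ranks)

def weighted_score(keyword_score: float, vector_score: float, alpha: float = 0.5) -> float:
    """Weighted fusion: alpha weights the normalized keyword score, 1 - alpha the vector score."""
    return alpha * keyword_score + (1 - alpha) * vector_score

# A chunk ranked first by both keyword and vector search:
print(rrf_score([1, 1]))              # 2 / 61 ~= 0.0328, matching the sqlite-vec tests
print(weighted_score(1.0, 1.0, 0.5))  # 1.0 when both normalized scores are maximal
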
diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml
index 36a120897..ebe0849f3 100644
--- a/llama_stack/templates/ollama/build.yaml
+++ b/llama_stack/templates/ollama/build.yaml
@@ -23,6 +23,8 @@ distribution_spec:
- inline::basic
- inline::llm-as-judge
- inline::braintrust
+ files:
+ - inline::localfs
post_training:
- inline::huggingface
tool_runtime:
diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py
index 0b4f05128..46c4852a4 100644
--- a/llama_stack/templates/ollama/ollama.py
+++ b/llama_stack/templates/ollama/ollama.py
@@ -13,6 +13,7 @@ from llama_stack.distribution.datatypes import (
ShieldInput,
ToolGroupInput,
)
+from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
from llama_stack.providers.inline.post_training.huggingface import HuggingFacePostTrainingConfig
from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
@@ -29,6 +30,7 @@ def get_distribution_template() -> DistributionTemplate:
"eval": ["inline::meta-reference"],
"datasetio": ["remote::huggingface", "inline::localfs"],
"scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
+ "files": ["inline::localfs"],
"post_training": ["inline::huggingface"],
"tool_runtime": [
"remote::brave-search",
@@ -49,6 +51,11 @@ def get_distribution_template() -> DistributionTemplate:
provider_type="inline::faiss",
config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
)
+ files_provider = Provider(
+ provider_id="meta-reference-files",
+ provider_type="inline::localfs",
+ config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+ )
posttraining_provider = Provider(
provider_id="huggingface",
provider_type="inline::huggingface",
@@ -98,6 +105,7 @@ def get_distribution_template() -> DistributionTemplate:
provider_overrides={
"inference": [inference_provider],
"vector_io": [vector_io_provider_faiss],
+ "files": [files_provider],
"post_training": [posttraining_provider],
},
default_models=[inference_model, embedding_model],
@@ -107,6 +115,7 @@ def get_distribution_template() -> DistributionTemplate:
provider_overrides={
"inference": [inference_provider],
"vector_io": [vector_io_provider_faiss],
+ "files": [files_provider],
"post_training": [posttraining_provider],
"safety": [
Provider(
diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml
index 7bf9fc3bd..85d5c813b 100644
--- a/llama_stack/templates/ollama/run-with-safety.yaml
+++ b/llama_stack/templates/ollama/run-with-safety.yaml
@@ -4,6 +4,7 @@ apis:
- agents
- datasetio
- eval
+- files
- inference
- post_training
- safety
@@ -84,6 +85,14 @@ providers:
provider_type: inline::braintrust
config:
openai_api_key: ${env.OPENAI_API_KEY:}
+ files:
+ - provider_id: meta-reference-files
+ provider_type: inline::localfs
+ config:
+ storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/ollama/files}
+ metadata_store:
+ type: sqlite
+ db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/files_metadata.db
post_training:
- provider_id: huggingface
provider_type: inline::huggingface
diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml
index 0030bcd60..2d10a99a4 100644
--- a/llama_stack/templates/ollama/run.yaml
+++ b/llama_stack/templates/ollama/run.yaml
@@ -4,6 +4,7 @@ apis:
- agents
- datasetio
- eval
+- files
- inference
- post_training
- safety
@@ -82,6 +83,14 @@ providers:
provider_type: inline::braintrust
config:
openai_api_key: ${env.OPENAI_API_KEY:}
+ files:
+ - provider_id: meta-reference-files
+ provider_type: inline::localfs
+ config:
+ storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/ollama/files}
+ metadata_store:
+ type: sqlite
+ db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/files_metadata.db
post_training:
- provider_id: huggingface
provider_type: inline::huggingface
diff --git a/llama_stack/templates/starter/build.yaml b/llama_stack/templates/starter/build.yaml
index 5fd3cc3f5..9bf4913a7 100644
--- a/llama_stack/templates/starter/build.yaml
+++ b/llama_stack/templates/starter/build.yaml
@@ -17,6 +17,8 @@ distribution_spec:
- inline::sqlite-vec
- remote::chromadb
- remote::pgvector
+ files:
+ - inline::localfs
safety:
- inline::llama-guard
agents:
diff --git a/llama_stack/templates/starter/run.yaml b/llama_stack/templates/starter/run.yaml
index 4732afa77..319ababe5 100644
--- a/llama_stack/templates/starter/run.yaml
+++ b/llama_stack/templates/starter/run.yaml
@@ -4,6 +4,7 @@ apis:
- agents
- datasetio
- eval
+- files
- inference
- safety
- scoring
@@ -75,6 +76,14 @@ providers:
db: ${env.PGVECTOR_DB:}
user: ${env.PGVECTOR_USER:}
password: ${env.PGVECTOR_PASSWORD:}
+ files:
+ - provider_id: meta-reference-files
+ provider_type: inline::localfs
+ config:
+ storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/starter/files}
+ metadata_store:
+ type: sqlite
+ db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/files_metadata.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
diff --git a/llama_stack/templates/starter/starter.py b/llama_stack/templates/starter/starter.py
index 650ecc87f..2a44a0a37 100644
--- a/llama_stack/templates/starter/starter.py
+++ b/llama_stack/templates/starter/starter.py
@@ -12,6 +12,7 @@ from llama_stack.distribution.datatypes import (
ShieldInput,
ToolGroupInput,
)
+from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
from llama_stack.providers.inline.inference.sentence_transformers import (
SentenceTransformersInferenceConfig,
)
@@ -134,6 +135,7 @@ def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ([p.provider_type for p in inference_providers] + ["inline::sentence-transformers"]),
"vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"],
+ "files": ["inline::localfs"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
@@ -170,6 +172,11 @@ def get_distribution_template() -> DistributionTemplate:
),
),
]
+ files_provider = Provider(
+ provider_id="meta-reference-files",
+ provider_type="inline::localfs",
+ config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+ )
embedding_provider = Provider(
provider_id="sentence-transformers",
provider_type="inline::sentence-transformers",
@@ -212,6 +219,7 @@ def get_distribution_template() -> DistributionTemplate:
provider_overrides={
"inference": inference_providers + [embedding_provider],
"vector_io": vector_io_providers,
+ "files": [files_provider],
},
default_models=default_models + [embedding_model],
default_tool_groups=default_tool_groups,
diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py
index 461527d18..3e43af272 100644
--- a/tests/integration/inference/test_openai_completion.py
+++ b/tests/integration/inference/test_openai_completion.py
@@ -22,9 +22,6 @@ def provider_from_model(client_with_models, model_id):
def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id):
- if isinstance(client_with_models, LlamaStackAsLibraryClient):
- pytest.skip("OpenAI completions are not supported when testing with library client yet.")
-
provider = provider_from_model(client_with_models, model_id)
if provider.provider_type in (
"inline::meta-reference",
@@ -44,6 +41,23 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
+def skip_if_model_doesnt_support_suffix(client_with_models, model_id):
+ # To test `fim` (fill-in-the-middle) completion, we need to use a model that supports suffix.
+ # Use this to specifically test this API functionality.
+
+ # pytest -sv --stack-config="inference=ollama" \
+ # tests/integration/inference/test_openai_completion.py \
+ # --text-model qwen2.5-coder:1.5b \
+ # -k test_openai_completion_non_streaming_suffix
+
+ if model_id != "qwen2.5-coder:1.5b":
+ pytest.skip(f"Suffix is not supported for the model: {model_id}.")
+
+ provider = provider_from_model(client_with_models, model_id)
+ if provider.provider_type != "remote::ollama":
+ pytest.skip(f"Provider {provider.provider_type} doesn't support suffix.")
+
+
def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
if isinstance(client_with_models, LlamaStackAsLibraryClient):
pytest.skip("OpenAI chat completions are not supported when testing with library client yet.")
@@ -102,6 +116,32 @@ def test_openai_completion_non_streaming(llama_stack_client, client_with_models,
assert len(choice.text) > 10
+@pytest.mark.parametrize(
+ "test_case",
+ [
+ "inference:completion:suffix",
+ ],
+)
+def test_openai_completion_non_streaming_suffix(llama_stack_client, client_with_models, text_model_id, test_case):
+ skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
+ skip_if_model_doesnt_support_suffix(client_with_models, text_model_id)
+ tc = TestCase(test_case)
+
+ # ollama needs more verbose prompting for some reason here...
+ response = llama_stack_client.completions.create(
+ model=text_model_id,
+ prompt=tc["content"],
+ stream=False,
+ suffix=tc["suffix"],
+ max_tokens=10,
+ )
+
+ assert len(response.choices) > 0
+ choice = response.choices[0]
+ assert len(choice.text) > 5
+ assert "france" in choice.text.lower()
+
+
@pytest.mark.parametrize(
"test_case",
[
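
With suffix plumbed through, fill-in-the-middle completions work end to end. A minimal sketch of exercising it against a running stack, mirroring the integration test above; the base URL is an assumed local endpoint and the model must be FIM-capable (the test uses qwen2.5-coder:1.5b served via Ollama):

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local stack endpoint

response = client.completions.create(
    model="qwen2.5-coder:1.5b",  # FIM-capable model from the integration test
    prompt="The capital of ",
    suffix="is Paris.",
    max_tokens=10,
    stream=False,
)
print(response.choices[0].text)  # expected to mention France
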
diff --git a/tests/integration/inference/test_openai_embeddings.py b/tests/integration/inference/test_openai_embeddings.py
index 90a91a206..1b8bd9038 100644
--- a/tests/integration/inference/test_openai_embeddings.py
+++ b/tests/integration/inference/test_openai_embeddings.py
@@ -51,7 +51,6 @@ def skip_if_model_doesnt_support_openai_embeddings(client, model_id):
"remote::runpod",
"remote::sambanova",
"remote::tgi",
- "remote::ollama",
):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI embeddings.")
diff --git a/tests/integration/test_cases/inference/completion.json b/tests/integration/test_cases/inference/completion.json
index 731ceddbc..baaecb375 100644
--- a/tests/integration/test_cases/inference/completion.json
+++ b/tests/integration/test_cases/inference/completion.json
@@ -4,6 +4,12 @@
"content": "Complete the sentence using one word: Roses are red, violets are "
}
},
+ "suffix": {
+ "data": {
+ "content": "The capital of ",
+ "suffix": "is Paris."
+ }
+ },
"non_streaming": {
"data": {
"content": "Micheael Jordan is born in ",
diff --git a/tests/unit/providers/vector_io/test_sqlite_vec.py b/tests/unit/providers/vector_io/test_sqlite_vec.py
index 010a0ca42..6424b9e86 100644
--- a/tests/unit/providers/vector_io/test_sqlite_vec.py
+++ b/tests/unit/providers/vector_io/test_sqlite_vec.py
@@ -84,6 +84,28 @@ async def test_query_chunks_full_text_search(sqlite_vec_index, sample_chunks, sa
assert len(response_no_results.chunks) == 0, f"Expected 0 results, but got {len(response_no_results.chunks)}"
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid(sqlite_vec_index, sample_chunks, sample_embeddings):
+ await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+ # Create a query embedding that's similar to the first chunk
+ query_embedding = sample_embeddings[0]
+ query_string = "Sentence 5"
+
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=3,
+ score_threshold=0.0,
+ reranker_type="rrf",
+ reranker_params={"impact_factor": 60.0},
+ )
+
+ assert len(response.chunks) == 3, f"Expected 3 results, got {len(response.chunks)}"
+ # Verify scores are in descending order (higher is better)
+ assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
+
+
@pytest.mark.asyncio
async def test_query_chunks_full_text_search_k_greater_than_results(sqlite_vec_index, sample_chunks, sample_embeddings):
# Re-initialize with a clean index
@@ -141,3 +163,355 @@ def test_generate_chunk_id():
"bc744db3-1b25-0a9c-cdff-b6ba3df73c36",
"f68df25d-d9aa-ab4d-5684-64a233add20d",
]
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_no_keyword_matches(sqlite_vec_index, sample_chunks, sample_embeddings):
+ """Test hybrid search when keyword search returns no matches - should still return vector results."""
+ await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+ # Use a non-existent keyword but a valid vector query
+ query_embedding = sample_embeddings[0]
+ query_string = "Sentence 499"
+
+ # First verify keyword search returns no results
+ keyword_response = await sqlite_vec_index.query_keyword(query_string, k=5, score_threshold=0.0)
+ assert len(keyword_response.chunks) == 0, "Keyword search should return no results"
+
+ # Get hybrid results
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=3,
+ score_threshold=0.0,
+ reranker_type="rrf",
+ reranker_params={"impact_factor": 60.0},
+ )
+
+ # Should still get results from vector search
+ assert len(response.chunks) > 0, "Should get results from vector search even with no keyword matches"
+ # Verify scores are in descending order
+ assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_score_threshold(sqlite_vec_index, sample_chunks, sample_embeddings):
+ """Test hybrid search with a high score threshold."""
+ await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+ # Use a very high score threshold that no results will meet
+ query_embedding = sample_embeddings[0]
+ query_string = "Sentence 5"
+
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=3,
+ score_threshold=1000.0, # Very high threshold
+ reranker_type="rrf",
+ reranker_params={"impact_factor": 60.0},
+ )
+
+ # Should return no results due to high threshold
+ assert len(response.chunks) == 0
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_different_embedding(
+ sqlite_vec_index, sample_chunks, sample_embeddings, embedding_dimension
+):
+ """Test hybrid search with a different embedding than the stored ones."""
+ await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+ # Create a random embedding that's different from stored ones
+ query_embedding = np.random.rand(embedding_dimension).astype(np.float32)
+ query_string = "Sentence 5"
+
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=3,
+ score_threshold=0.0,
+ reranker_type="rrf",
+ reranker_params={"impact_factor": 60.0},
+ )
+
+ # Should still get results if keyword matches exist
+ assert len(response.chunks) > 0
+ # Verify scores are in descending order
+ assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_rrf_ranking(sqlite_vec_index, sample_chunks, sample_embeddings):
+ """Test that RRF properly combines rankings when documents appear in both search methods."""
+ await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+ # Create a query embedding that's similar to the first chunk
+ query_embedding = sample_embeddings[0]
+ # Use a keyword that appears in multiple documents
+ query_string = "Sentence 5"
+
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=5,
+ score_threshold=0.0,
+ reranker_type="rrf",
+ reranker_params={"impact_factor": 60.0},
+ )
+
+ # Verify we get results from both search methods
+ assert len(response.chunks) > 0
+ # Verify scores are in descending order (RRF should maintain this)
+ assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_score_selection(sqlite_vec_index, sample_chunks, sample_embeddings):
+ await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+ # Create a query embedding that's similar to the first chunk
+ query_embedding = sample_embeddings[0]
+ # Use a keyword that appears in the first document
+ query_string = "Sentence 0 from document 0"
+
+ # Test weighted re-ranking
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=1,
+ score_threshold=0.0,
+ reranker_type="weighted",
+ reranker_params={"alpha": 0.5},
+ )
+ assert len(response.chunks) == 1
+ # Score should be weighted average of normalized keyword score and vector score
+ assert response.scores[0] > 0.5 # Both scores should be high
+
+ # Test RRF re-ranking
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=1,
+ score_threshold=0.0,
+ reranker_type="rrf",
+ reranker_params={"impact_factor": 60.0},
+ )
+ assert len(response.chunks) == 1
+ # RRF score should be sum of reciprocal ranks
+ assert response.scores[0] == pytest.approx(2.0 / 61.0, rel=1e-6) # 1/(60+1) + 1/(60+1)
+
+ # Test RRF re-ranking with explicit parameters that match the defaults
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=1,
+ score_threshold=0.0,
+ reranker_type="rrf",
+ reranker_params={"impact_factor": 60.0},
+ )
+ assert len(response.chunks) == 1
+ assert response.scores[0] == pytest.approx(2.0 / 61.0, rel=1e-6) # Should behave like RRF
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_mixed_results(sqlite_vec_index, sample_chunks, sample_embeddings):
+ """Test hybrid search with documents that appear in only one search method."""
+ await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+ # Create a query embedding that's similar to the first chunk
+ query_embedding = sample_embeddings[0]
+ # Use a keyword that appears in a different document
+ query_string = "Sentence 9 from document 2"
+
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=3,
+ score_threshold=0.0,
+ reranker_type="rrf",
+ reranker_params={"impact_factor": 60.0},
+ )
+
+ # Should get results from both search methods
+ assert len(response.chunks) > 0
+ # Verify scores are in descending order
+ assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
+ # Verify we get results from both the vector-similar document and keyword-matched document
+ doc_ids = {chunk.metadata["document_id"] for chunk in response.chunks}
+ assert "document-0" in doc_ids # From vector search
+ assert "document-2" in doc_ids # From keyword search
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_weighted_reranker_parametrization(
+ sqlite_vec_index, sample_chunks, sample_embeddings
+):
+ """Test WeightedReRanker with different alpha values."""
+ # Re-add data before each search to ensure test isolation
+ await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+ query_embedding = sample_embeddings[0]
+ query_string = "Sentence 0 from document 0"
+
+ # alpha=1.0 (should behave like pure keyword)
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=1,
+ score_threshold=0.0,
+ reranker_type="weighted",
+ reranker_params={"alpha": 1.0},
+ )
+ assert len(response.chunks) > 0 # Should get at least one result
+ assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks)
+
+ # alpha=0.0 (should behave like pure vector)
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=1,
+ score_threshold=0.0,
+ reranker_type="weighted",
+ reranker_params={"alpha": 0.0},
+ )
+ assert len(response.chunks) > 0 # Should get at least one result
+ assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks)
+
+ await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+ # alpha=0.7 (should be a mix)
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=1,
+ score_threshold=0.0,
+ reranker_type="weighted",
+ reranker_params={"alpha": 0.7},
+ )
+ assert len(response.chunks) > 0 # Should get at least one result
+ assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks)
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_rrf_impact_factor(sqlite_vec_index, sample_chunks, sample_embeddings):
+ """Test RRFReRanker with different impact factors."""
+ await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+ query_embedding = sample_embeddings[0]
+ query_string = "Sentence 0 from document 0"
+
+ # impact_factor=10
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=1,
+ score_threshold=0.0,
+ reranker_type="rrf",
+ reranker_params={"impact_factor": 10.0},
+ )
+ assert len(response.chunks) == 1
+ assert response.scores[0] == pytest.approx(2.0 / 11.0, rel=1e-6)
+
+ # impact_factor=100
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=1,
+ score_threshold=0.0,
+ reranker_type="rrf",
+ reranker_params={"impact_factor": 100.0},
+ )
+ assert len(response.chunks) == 1
+ assert response.scores[0] == pytest.approx(2.0 / 101.0, rel=1e-6)
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_edge_cases(sqlite_vec_index, sample_chunks, sample_embeddings):
+ await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+ # No results from either search - use a completely different embedding and a nonzero threshold
+ query_embedding = np.ones_like(sample_embeddings[0]) * -1 # Very different from sample embeddings
+ query_string = "no_such_keyword_that_will_never_match"
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=3,
+ score_threshold=0.1, # Nonzero threshold to filter out low-similarity matches
+ reranker_type="rrf",
+ reranker_params={"impact_factor": 60.0},
+ )
+ assert len(response.chunks) == 0
+
+ # All results below threshold
+ query_embedding = sample_embeddings[0]
+ query_string = "Sentence 0 from document 0"
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=3,
+ score_threshold=1000.0,
+ reranker_type="rrf",
+ reranker_params={"impact_factor": 60.0},
+ )
+ assert len(response.chunks) == 0
+
+ # Large k value
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=100,
+ score_threshold=0.0,
+ reranker_type="rrf",
+ reranker_params={"impact_factor": 60.0},
+ )
+ # Should not error, should return all available results
+ assert len(response.chunks) > 0
+ assert len(response.chunks) <= 100
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_tie_breaking(
+ sqlite_vec_index, sample_embeddings, embedding_dimension, tmp_path_factory
+):
+ """Test tie-breaking and determinism when scores are equal."""
+ # Create two chunks with the same content and embedding
+ chunk1 = Chunk(content="identical", metadata={"document_id": "docA"})
+ chunk2 = Chunk(content="identical", metadata={"document_id": "docB"})
+ chunks = [chunk1, chunk2]
+ # Use the same embedding for both chunks to ensure equal scores
+ same_embedding = sample_embeddings[0]
+ embeddings = np.array([same_embedding, same_embedding])
+
+ # Clear existing data and recreate index
+ await sqlite_vec_index.delete()
+ temp_dir = tmp_path_factory.getbasetemp()
+ db_path = str(temp_dir / "test_sqlite.db")
+ sqlite_vec_index = await SQLiteVecIndex.create(dimension=embedding_dimension, db_path=db_path, bank_id="test_bank")
+ await sqlite_vec_index.add_chunks(chunks, embeddings)
+
+ # Query with the same embedding and content to ensure equal scores
+ query_embedding = same_embedding
+ query_string = "identical"
+
+ # Run multiple queries to verify determinism
+ responses = []
+ for _ in range(3):
+ response = await sqlite_vec_index.query_hybrid(
+ embedding=query_embedding,
+ query_string=query_string,
+ k=2,
+ score_threshold=0.0,
+ reranker_type="rrf",
+ reranker_params={"impact_factor": 60.0},
+ )
+ responses.append(response)
+
+ # Verify all responses are identical
+ first_response = responses[0]
+ for response in responses[1:]:
+ assert response.chunks == first_response.chunks
+ assert response.scores == first_response.scores
+
+ # Verify both chunks are returned with equal scores
+ assert len(first_response.chunks) == 2
+ assert first_response.scores[0] == first_response.scores[1]
+ assert {chunk.metadata["document_id"] for chunk in first_response.chunks} == {"docA", "docB"}
diff --git a/tests/verifications/openai_api/fixtures/pdfs/llama_stack_and_models.pdf b/tests/verifications/openai_api/fixtures/pdfs/llama_stack_and_models.pdf
new file mode 100644
index 000000000..25579f425
Binary files /dev/null and b/tests/verifications/openai_api/fixtures/pdfs/llama_stack_and_models.pdf differ
diff --git a/tests/verifications/openai_api/fixtures/test_cases/responses.yaml b/tests/verifications/openai_api/fixtures/test_cases/responses.yaml
index 4d6c19b59..1acf06388 100644
--- a/tests/verifications/openai_api/fixtures/test_cases/responses.yaml
+++ b/tests/verifications/openai_api/fixtures/test_cases/responses.yaml
@@ -31,6 +31,25 @@ test_response_web_search:
search_context_size: "low"
output: "128"
+test_response_file_search:
+ test_name: test_response_file_search
+ test_params:
+ case:
+ - case_id: "llama_experts"
+ input: "How many experts does the Llama 4 Maverick model have?"
+ tools:
+ - type: file_search
+ # vector_store_ids param for file_search tool gets added by the test runner
+ file_content: "Llama 4 Maverick has 128 experts"
+ output: "128"
+ - case_id: "llama_experts_pdf"
+ input: "How many experts does the Llama 4 Maverick model have?"
+ tools:
+ - type: file_search
+ # vector_store_ids param for file_search tool gets added by the test runner
+ file_path: "pdfs/llama_stack_and_models.pdf"
+ output: "128"
+
test_response_mcp_tool:
test_name: test_response_mcp_tool
test_params:
diff --git a/tests/verifications/openai_api/test_responses.py b/tests/verifications/openai_api/test_responses.py
index 28020d3b1..1c9cdaa3a 100644
--- a/tests/verifications/openai_api/test_responses.py
+++ b/tests/verifications/openai_api/test_responses.py
@@ -5,6 +5,8 @@
# the root directory of this source tree.
import json
+import os
+import time
import httpx
import openai
@@ -23,6 +25,31 @@ from tests.verifications.openai_api.fixtures.load import load_test_cases
responses_test_cases = load_test_cases("responses")
+def _new_vector_store(openai_client, name):
+ # Ensure we don't reuse an existing vector store
+ vector_stores = openai_client.vector_stores.list()
+ for vector_store in vector_stores:
+ if vector_store.name == name:
+ openai_client.vector_stores.delete(vector_store_id=vector_store.id)
+
+ # Create a new vector store
+ vector_store = openai_client.vector_stores.create(
+ name=name,
+ )
+ return vector_store
+
+
+def _upload_file(openai_client, name, file_path):
+ # Ensure we don't reuse an existing file
+ files = openai_client.files.list()
+ for file in files:
+ if file.filename == name:
+ openai_client.files.delete(file_id=file.id)
+
+ # Upload the file containing our document content
+ return openai_client.files.create(file=open(file_path, "rb"), purpose="assistants")
+
+
@pytest.mark.parametrize(
"case",
responses_test_cases["test_response_basic"]["test_params"]["case"],
@@ -258,6 +285,111 @@ def test_response_non_streaming_web_search(request, openai_client, model, provid
assert case["output"].lower() in response.output_text.lower().strip()
+@pytest.mark.parametrize(
+ "case",
+ responses_test_cases["test_response_file_search"]["test_params"]["case"],
+ ids=case_id_generator,
+)
+def test_response_non_streaming_file_search(
+ request, openai_client, model, provider, verification_config, tmp_path, case
+):
+ if isinstance(openai_client, LlamaStackAsLibraryClient):
+ pytest.skip("Responses API file search is not yet supported in library client.")
+
+ test_name_base = get_base_test_name(request)
+ if should_skip_test(verification_config, provider, model, test_name_base):
+ pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
+
+ vector_store = _new_vector_store(openai_client, "test_vector_store")
+
+ if "file_content" in case:
+ file_name = "test_response_non_streaming_file_search.txt"
+ file_path = tmp_path / file_name
+ file_path.write_text(case["file_content"])
+ elif "file_path" in case:
+ file_path = os.path.join(os.path.dirname(__file__), "fixtures", case["file_path"])
+ file_name = os.path.basename(file_path)
+ else:
+ raise ValueError(f"No file content or path provided for case {case['case_id']}")
+
+ file_response = _upload_file(openai_client, file_name, file_path)
+
+ # Attach our file to the vector store
+ file_attach_response = openai_client.vector_stores.files.create(
+ vector_store_id=vector_store.id,
+ file_id=file_response.id,
+ )
+
+ # Wait for the file to be attached
+ while file_attach_response.status == "in_progress":
+ time.sleep(0.1)
+ file_attach_response = openai_client.vector_stores.files.retrieve(
+ vector_store_id=vector_store.id,
+ file_id=file_response.id,
+ )
+ assert file_attach_response.status == "completed", f"Expected file to be attached, got {file_attach_response}"
+ assert not file_attach_response.last_error
+
+ # Update our tools with the right vector store id
+ tools = case["tools"]
+ for tool in tools:
+ if tool["type"] == "file_search":
+ tool["vector_store_ids"] = [vector_store.id]
+
+ # Create the response request, which should query our vector store
+ response = openai_client.responses.create(
+ model=model,
+ input=case["input"],
+ tools=tools,
+ stream=False,
+ include=["file_search_call.results"],
+ )
+
+ # Verify the file_search_tool was called
+ assert len(response.output) > 1
+ assert response.output[0].type == "file_search_call"
+ assert response.output[0].status == "completed"
+ assert response.output[0].queries # ensure it's some non-empty list
+ assert response.output[0].results
+ assert case["output"].lower() in response.output[0].results[0].text.lower()
+ assert response.output[0].results[0].score > 0
+
+ # Verify the output_text generated by the response
+ assert case["output"].lower() in response.output_text.lower().strip()
+
+
+def test_response_non_streaming_file_search_empty_vector_store(
+ request, openai_client, model, provider, verification_config
+):
+ if isinstance(openai_client, LlamaStackAsLibraryClient):
+ pytest.skip("Responses API file search is not yet supported in library client.")
+
+ test_name_base = get_base_test_name(request)
+ if should_skip_test(verification_config, provider, model, test_name_base):
+ pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
+
+ vector_store = _new_vector_store(openai_client, "test_vector_store")
+
+ # Create the response request, which should query our vector store
+ response = openai_client.responses.create(
+ model=model,
+ input="How many experts does the Llama 4 Maverick model have?",
+ tools=[{"type": "file_search", "vector_store_ids": [vector_store.id]}],
+ stream=False,
+ include=["file_search_call.results"],
+ )
+
+ # Verify the file_search_tool was called
+ assert len(response.output) > 1
+ assert response.output[0].type == "file_search_call"
+ assert response.output[0].status == "completed"
+ assert response.output[0].queries # ensure it's some non-empty list
+ assert not response.output[0].results # ensure we don't get any results
+
+ # Verify some output_text was generated by the response
+ assert response.output_text
+
+
@pytest.mark.parametrize(
"case",
responses_test_cases["test_response_mcp_tool"]["test_params"]["case"],