diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 96de04ec9..fddce0c57 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -3240,6 +3240,59 @@ } } }, + "/v1/openai/v1/vector_stores/{vector_store_id}/files": { + "post": { + "responses": { + "200": { + "description": "A VectorStoreFileObject representing the attached file.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/VectorStoreFileObject" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "VectorIO" + ], + "description": "Attach a file to a vector store.", + "parameters": [ + { + "name": "vector_store_id", + "in": "path", + "description": "The ID of the vector store to attach the file to.", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenaiAttachFileToVectorStoreRequest" + } + } + }, + "required": true + } + } + }, "/v1/openai/v1/completions": { "post": { "responses": { @@ -7047,6 +7100,9 @@ { "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall" + }, { "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall" }, @@ -7193,12 +7249,41 @@ "const": "file_search", "default": "file_search" }, - "vector_store_id": { + "vector_store_ids": { "type": "array", "items": { "type": "string" } }, + "filters": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "max_num_results": { + "type": "integer", + "default": 10 + }, "ranking_options": { "type": "object", "properties": { @@ -7217,7 +7302,7 @@ "additionalProperties": false, "required": [ "type", - "vector_store_id" + "vector_store_ids" ], "title": "OpenAIResponseInputToolFileSearch" }, @@ -7484,6 +7569,64 @@ ], "title": "OpenAIResponseOutputMessageContentOutputText" }, + "OpenAIResponseOutputMessageFileSearchToolCall": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "queries": { + "type": "array", + "items": { + "type": "string" + } + }, + "status": { + "type": "string" + }, + "type": { + "type": "string", + "const": "file_search_call", + "default": "file_search_call" + }, + "results": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + } + }, + "additionalProperties": false, + "required": [ + "id", + "queries", + "status", + "type" + ], + "title": "OpenAIResponseOutputMessageFileSearchToolCall" + }, "OpenAIResponseOutputMessageFunctionToolCall": { "type": "object", "properties": { @@ -7760,6 +7903,9 @@ { "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall" + }, { "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall" 
}, @@ -7775,6 +7921,7 @@ "mapping": { "message": "#/components/schemas/OpenAIResponseMessage", "web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall", + "file_search_call": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall", "function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall", "mcp_call": "#/components/schemas/OpenAIResponseOutputMessageMCPCall", "mcp_list_tools": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools" @@ -11766,6 +11913,232 @@ ], "title": "LogEventRequest" }, + "VectorStoreChunkingStrategy": { + "oneOf": [ + { + "$ref": "#/components/schemas/VectorStoreChunkingStrategyAuto" + }, + { + "$ref": "#/components/schemas/VectorStoreChunkingStrategyStatic" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "auto": "#/components/schemas/VectorStoreChunkingStrategyAuto", + "static": "#/components/schemas/VectorStoreChunkingStrategyStatic" + } + } + }, + "VectorStoreChunkingStrategyAuto": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "auto", + "default": "auto" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "VectorStoreChunkingStrategyAuto" + }, + "VectorStoreChunkingStrategyStatic": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "static", + "default": "static" + }, + "static": { + "$ref": "#/components/schemas/VectorStoreChunkingStrategyStaticConfig" + } + }, + "additionalProperties": false, + "required": [ + "type", + "static" + ], + "title": "VectorStoreChunkingStrategyStatic" + }, + "VectorStoreChunkingStrategyStaticConfig": { + "type": "object", + "properties": { + "chunk_overlap_tokens": { + "type": "integer", + "default": 400 + }, + "max_chunk_size_tokens": { + "type": "integer", + "default": 800 + } + }, + "additionalProperties": false, + "required": [ + "chunk_overlap_tokens", + "max_chunk_size_tokens" + ], + "title": "VectorStoreChunkingStrategyStaticConfig" + }, + "OpenaiAttachFileToVectorStoreRequest": { + "type": "object", + "properties": { + "file_id": { + "type": "string", + "description": "The ID of the file to attach to the vector store." + }, + "attributes": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + }, + "description": "The key-value attributes stored with the file, which can be used for filtering." + }, + "chunking_strategy": { + "$ref": "#/components/schemas/VectorStoreChunkingStrategy", + "description": "The chunking strategy to use for the file." 
+ } + }, + "additionalProperties": false, + "required": [ + "file_id" + ], + "title": "OpenaiAttachFileToVectorStoreRequest" + }, + "VectorStoreFileLastError": { + "type": "object", + "properties": { + "code": { + "oneOf": [ + { + "type": "string", + "const": "server_error" + }, + { + "type": "string", + "const": "rate_limit_exceeded" + } + ] + }, + "message": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "code", + "message" + ], + "title": "VectorStoreFileLastError" + }, + "VectorStoreFileObject": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "object": { + "type": "string", + "default": "vector_store.file" + }, + "attributes": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "chunking_strategy": { + "$ref": "#/components/schemas/VectorStoreChunkingStrategy" + }, + "created_at": { + "type": "integer" + }, + "last_error": { + "$ref": "#/components/schemas/VectorStoreFileLastError" + }, + "status": { + "oneOf": [ + { + "type": "string", + "const": "completed" + }, + { + "type": "string", + "const": "in_progress" + }, + { + "type": "string", + "const": "cancelled" + }, + { + "type": "string", + "const": "failed" + } + ] + }, + "usage_bytes": { + "type": "integer", + "default": 0 + }, + "vector_store_id": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "id", + "object", + "attributes", + "chunking_strategy", + "created_at", + "status", + "usage_bytes", + "vector_store_id" + ], + "title": "VectorStoreFileObject", + "description": "OpenAI Vector Store File object." + }, "OpenAIJSONSchema": { "type": "object", "properties": { @@ -12404,6 +12777,10 @@ }, "prompt_logprobs": { "type": "integer" + }, + "suffix": { + "type": "string", + "description": "(Optional) The suffix that should be appended to the completion." } }, "additionalProperties": false, @@ -13621,7 +13998,11 @@ }, "mode": { "type": "string", - "description": "Search mode for retrieval—either \"vector\" or \"keyword\". Default \"vector\"." + "description": "Search mode for retrieval—either \"vector\", \"keyword\", or \"hybrid\". Default \"vector\"." + }, + "ranker": { + "$ref": "#/components/schemas/Ranker", + "description": "Configuration for the ranker to use in hybrid search. Defaults to RRF ranker." } }, "additionalProperties": false, @@ -13651,6 +14032,69 @@ } } }, + "RRFRanker": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "rrf", + "default": "rrf", + "description": "The type of ranker, always \"rrf\"" + }, + "impact_factor": { + "type": "number", + "default": 60.0, + "description": "The impact factor for RRF scoring. Higher values give more weight to higher-ranked results. Must be greater than 0. Default of 60 is from the original RRF paper (Cormack et al., 2009)." + } + }, + "additionalProperties": false, + "required": [ + "type", + "impact_factor" + ], + "title": "RRFRanker", + "description": "Reciprocal Rank Fusion (RRF) ranker configuration." 
+ }, + "Ranker": { + "oneOf": [ + { + "$ref": "#/components/schemas/RRFRanker" + }, + { + "$ref": "#/components/schemas/WeightedRanker" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "rrf": "#/components/schemas/RRFRanker", + "weighted": "#/components/schemas/WeightedRanker" + } + } + }, + "WeightedRanker": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted", + "default": "weighted", + "description": "The type of ranker, always \"weighted\"" + }, + "alpha": { + "type": "number", + "default": 0.5, + "description": "Weight factor between 0 and 1. 0 means only use keyword scores, 1 means only use vector scores, values in between blend both scores." + } + }, + "additionalProperties": false, + "required": [ + "type", + "alpha" + ], + "title": "WeightedRanker", + "description": "Weighted ranker configuration that combines vector and keyword scores." + }, "QueryRequest": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index b2fe870be..49388939f 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -2263,6 +2263,43 @@ paths: schema: $ref: '#/components/schemas/LogEventRequest' required: true + /v1/openai/v1/vector_stores/{vector_store_id}/files: + post: + responses: + '200': + description: >- + A VectorStoreFileObject representing the attached file. + content: + application/json: + schema: + $ref: '#/components/schemas/VectorStoreFileObject' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - VectorIO + description: Attach a file to a vector store. + parameters: + - name: vector_store_id + in: path + description: >- + The ID of the vector store to attach the file to. 
+ required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/OpenaiAttachFileToVectorStoreRequest' + required: true /v1/openai/v1/completions: post: responses: @@ -5021,6 +5058,7 @@ components: OpenAIResponseInput: oneOf: - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' + - $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall' - $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall' - $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput' - $ref: '#/components/schemas/OpenAIResponseMessage' @@ -5115,10 +5153,23 @@ components: type: string const: file_search default: file_search - vector_store_id: + vector_store_ids: type: array items: type: string + filters: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + max_num_results: + type: integer + default: 10 ranking_options: type: object properties: @@ -5132,7 +5183,7 @@ components: additionalProperties: false required: - type - - vector_store_id + - vector_store_ids title: OpenAIResponseInputToolFileSearch OpenAIResponseInputToolFunction: type: object @@ -5294,6 +5345,41 @@ components: - type title: >- OpenAIResponseOutputMessageContentOutputText + "OpenAIResponseOutputMessageFileSearchToolCall": + type: object + properties: + id: + type: string + queries: + type: array + items: + type: string + status: + type: string + type: + type: string + const: file_search_call + default: file_search_call + results: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - id + - queries + - status + - type + title: >- + OpenAIResponseOutputMessageFileSearchToolCall "OpenAIResponseOutputMessageFunctionToolCall": type: object properties: @@ -5491,6 +5577,7 @@ components: oneOf: - $ref: '#/components/schemas/OpenAIResponseMessage' - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' + - $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall' - $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall' - $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall' - $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools' @@ -5499,6 +5586,7 @@ components: mapping: message: '#/components/schemas/OpenAIResponseMessage' web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' + file_search_call: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall' function_call: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall' mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall' mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools' @@ -8251,6 +8339,148 @@ components: - event - ttl_seconds title: LogEventRequest + VectorStoreChunkingStrategy: + oneOf: + - $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto' + - $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic' + discriminator: + propertyName: type + mapping: + auto: '#/components/schemas/VectorStoreChunkingStrategyAuto' + static: '#/components/schemas/VectorStoreChunkingStrategyStatic' + VectorStoreChunkingStrategyAuto: + type: object + properties: + type: + type: string + const: auto + default: auto + additionalProperties: false + 
required: + - type + title: VectorStoreChunkingStrategyAuto + VectorStoreChunkingStrategyStatic: + type: object + properties: + type: + type: string + const: static + default: static + static: + $ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig' + additionalProperties: false + required: + - type + - static + title: VectorStoreChunkingStrategyStatic + VectorStoreChunkingStrategyStaticConfig: + type: object + properties: + chunk_overlap_tokens: + type: integer + default: 400 + max_chunk_size_tokens: + type: integer + default: 800 + additionalProperties: false + required: + - chunk_overlap_tokens + - max_chunk_size_tokens + title: VectorStoreChunkingStrategyStaticConfig + OpenaiAttachFileToVectorStoreRequest: + type: object + properties: + file_id: + type: string + description: >- + The ID of the file to attach to the vector store. + attributes: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: >- + The key-value attributes stored with the file, which can be used for filtering. + chunking_strategy: + $ref: '#/components/schemas/VectorStoreChunkingStrategy' + description: >- + The chunking strategy to use for the file. + additionalProperties: false + required: + - file_id + title: OpenaiAttachFileToVectorStoreRequest + VectorStoreFileLastError: + type: object + properties: + code: + oneOf: + - type: string + const: server_error + - type: string + const: rate_limit_exceeded + message: + type: string + additionalProperties: false + required: + - code + - message + title: VectorStoreFileLastError + VectorStoreFileObject: + type: object + properties: + id: + type: string + object: + type: string + default: vector_store.file + attributes: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + chunking_strategy: + $ref: '#/components/schemas/VectorStoreChunkingStrategy' + created_at: + type: integer + last_error: + $ref: '#/components/schemas/VectorStoreFileLastError' + status: + oneOf: + - type: string + const: completed + - type: string + const: in_progress + - type: string + const: cancelled + - type: string + const: failed + usage_bytes: + type: integer + default: 0 + vector_store_id: + type: string + additionalProperties: false + required: + - id + - object + - attributes + - chunking_strategy + - created_at + - status + - usage_bytes + - vector_store_id + title: VectorStoreFileObject + description: OpenAI Vector Store File object. OpenAIJSONSchema: type: object properties: @@ -8673,6 +8903,10 @@ components: type: string prompt_logprobs: type: integer + suffix: + type: string + description: >- + (Optional) The suffix that should be appended to the completion. additionalProperties: false required: - model @@ -9526,7 +9760,13 @@ components: mode: type: string description: >- - Search mode for retrieval—either "vector" or "keyword". Default "vector". + Search mode for retrieval—either "vector", "keyword", or "hybrid". Default + "vector". + ranker: + $ref: '#/components/schemas/Ranker' + description: >- + Configuration for the ranker to use in hybrid search. Defaults to RRF + ranker. 
additionalProperties: false required: - query_generator_config @@ -9545,6 +9785,58 @@ components: mapping: default: '#/components/schemas/DefaultRAGQueryGeneratorConfig' llm: '#/components/schemas/LLMRAGQueryGeneratorConfig' + RRFRanker: + type: object + properties: + type: + type: string + const: rrf + default: rrf + description: The type of ranker, always "rrf" + impact_factor: + type: number + default: 60.0 + description: >- + The impact factor for RRF scoring. Higher values give more weight to higher-ranked + results. Must be greater than 0. Default of 60 is from the original RRF + paper (Cormack et al., 2009). + additionalProperties: false + required: + - type + - impact_factor + title: RRFRanker + description: >- + Reciprocal Rank Fusion (RRF) ranker configuration. + Ranker: + oneOf: + - $ref: '#/components/schemas/RRFRanker' + - $ref: '#/components/schemas/WeightedRanker' + discriminator: + propertyName: type + mapping: + rrf: '#/components/schemas/RRFRanker' + weighted: '#/components/schemas/WeightedRanker' + WeightedRanker: + type: object + properties: + type: + type: string + const: weighted + default: weighted + description: The type of ranker, always "weighted" + alpha: + type: number + default: 0.5 + description: >- + Weight factor between 0 and 1. 0 means only use keyword scores, 1 means + only use vector scores, values in between blend both scores. + additionalProperties: false + required: + - type + - alpha + title: WeightedRanker + description: >- + Weighted ranker configuration that combines vector and keyword scores. QueryRequest: type: object properties: diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index 4d148feda..e09c79359 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -18,6 +18,7 @@ The `llamastack/distribution-ollama` distribution consists of the following prov | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | +| files | `inline::localfs` | | inference | `remote::ollama` | | post_training | `inline::huggingface` | | safety | `inline::llama-guard` | diff --git a/docs/source/providers/vector_io/sqlite-vec.md b/docs/source/providers/vector_io/sqlite-vec.md index 49ba659f7..3c7c4cbee 100644 --- a/docs/source/providers/vector_io/sqlite-vec.md +++ b/docs/source/providers/vector_io/sqlite-vec.md @@ -66,25 +66,126 @@ To use sqlite-vec in your Llama Stack project, follow these steps: 2. Configure your Llama Stack project to use SQLite-Vec. 3. Start storing and querying vectors. -## Supported Search Modes +The SQLite-vec provider supports three search modes: -The sqlite-vec provider supports both vector-based and keyword-based (full-text) search modes. - -When using the RAGTool interface, you can specify the desired search behavior via the `mode` parameter in -`RAGQueryConfig`. For example: +1. **Vector Search** (`mode="vector"`): Performs pure vector similarity search using the embeddings. +2. **Keyword Search** (`mode="keyword"`): Performs full-text search using SQLite's FTS5. +3. **Hybrid Search** (`mode="hybrid"`): Combines vector and keyword search for better results. Runs both searches independently and then merges the two ranked result lists with a configurable ranker (RRF or weighted), as illustrated in the fusion sketch below.
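The fusion step is easiest to see on toy data. The sketch below is illustrative only: the document IDs and scores are invented, and the two small functions mirror the RRF and weighted formulas described in this patch rather than calling the provider. The weighted blend is written with `alpha` as the weight on the vector score, matching the "70% vector, 30% keyword" reading used in the examples that follow.

```python
# Illustrative sketch of the two fusion strategies used by hybrid search.
# Scores and document IDs are made up; formulas follow the RRF and weighted
# rerankers described in this patch (alpha weights the vector side here).

vector_scores = {"doc-a": 0.92, "doc-b": 0.55, "doc-c": 0.30}  # similarity scores
keyword_scores = {"doc-b": 7.1, "doc-d": 5.4}                  # FTS5 relevance scores


def rrf(vector: dict[str, float], keyword: dict[str, float], impact_factor: float = 60.0) -> dict[str, float]:
    """Reciprocal Rank Fusion: sum of 1 / (impact_factor + rank) over both sources."""
    def ranks(scores: dict[str, float]) -> dict[str, int]:
        ordered = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return {doc: i + 1 for i, (doc, _) in enumerate(ordered)}

    vr, kr = ranks(vector), ranks(keyword)
    all_ids = vector.keys() | keyword.keys()
    # Missing documents get rank infinity, so their contribution is 0.
    return {
        doc: 1.0 / (impact_factor + vr.get(doc, float("inf")))
        + 1.0 / (impact_factor + kr.get(doc, float("inf")))
        for doc in all_ids
    }


def weighted(vector: dict[str, float], keyword: dict[str, float], alpha: float = 0.5) -> dict[str, float]:
    """Blend min-max normalized scores; alpha is the weight on the vector score."""
    def normalize(scores: dict[str, float]) -> dict[str, float]:
        lo, hi = min(scores.values()), max(scores.values())
        return {d: (s - lo) / (hi - lo) if hi > lo else 1.0 for d, s in scores.items()}

    nv, nk = normalize(vector), normalize(keyword)
    all_ids = vector.keys() | keyword.keys()
    return {doc: alpha * nv.get(doc, 0.0) + (1 - alpha) * nk.get(doc, 0.0) for doc in all_ids}


print(sorted(rrf(vector_scores, keyword_scores).items(), key=lambda x: x[1], reverse=True))
print(sorted(weighted(vector_scores, keyword_scores, alpha=0.7).items(), key=lambda x: x[1], reverse=True))
```

With these toy scores, RRF promotes `doc-b` because it appears in both result lists, while the weighted blend with `alpha=0.7` still favours the strong vector-only match `doc-a`; that difference is the main practical reason to pick one ranker over the other.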
+Example with hybrid search: ```python -from llama_stack.apis.tool_runtime.rag import RAGQueryConfig +response = await vector_io.query_chunks( + vector_db_id="my_db", + query="your query here", + params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7}, +) -query_config = RAGQueryConfig(max_chunks=6, mode="vector") +# Using RRF ranker +response = await vector_io.query_chunks( + vector_db_id="my_db", + query="your query here", + params={ + "mode": "hybrid", + "max_chunks": 3, + "score_threshold": 0.7, + "ranker": {"type": "rrf", "impact_factor": 60.0}, + }, +) -results = client.tool_runtime.rag_tool.query( - vector_db_ids=[vector_db_id], - content="what is torchtune", - query_config=query_config, +# Using weighted ranker +response = await vector_io.query_chunks( + vector_db_id="my_db", + query="your query here", + params={ + "mode": "hybrid", + "max_chunks": 3, + "score_threshold": 0.7, + "ranker": {"type": "weighted", "alpha": 0.7}, # 70% vector, 30% keyword + }, ) ``` +Example with explicit vector search: +```python +response = await vector_io.query_chunks( + vector_db_id="my_db", + query="your query here", + params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7}, +) +``` + +Example with keyword search: +```python +response = await vector_io.query_chunks( + vector_db_id="my_db", + query="your query here", + params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7}, +) +``` + +## Supported Search Modes + +The SQLite vector store supports three search modes: + +1. **Vector Search** (`mode="vector"`): Uses vector similarity to find relevant chunks +2. **Keyword Search** (`mode="keyword"`): Uses keyword matching to find relevant chunks +3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword scores using a ranker + +### Hybrid Search + +Hybrid search combines the strengths of both vector and keyword search by: +- Computing vector similarity scores +- Computing keyword match scores +- Using a ranker to combine these scores + +Two ranker types are supported: + +1. **RRF (Reciprocal Rank Fusion)**: + - Combines ranks from both vector and keyword results + - Uses an impact factor (default: 60.0) to control the weight of higher-ranked results + - Good for balancing between vector and keyword results + - The default impact factor of 60.0 comes from the original RRF paper by Cormack et al. (2009) [^1], which found this value to provide optimal performance across various retrieval tasks + +2. 
**Weighted**: + - Linearly combines normalized vector and keyword scores + - Uses an alpha parameter (0-1) to control the blend: + - alpha=0: Only use keyword scores + - alpha=1: Only use vector scores + - alpha=0.5: Equal weight to both (default) + +Example using RAGQueryConfig with different search modes: + +```python +from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker + +# Vector search +config = RAGQueryConfig(mode="vector", max_chunks=5) + +# Keyword search +config = RAGQueryConfig(mode="keyword", max_chunks=5) + +# Hybrid search with custom RRF ranker +config = RAGQueryConfig( + mode="hybrid", + max_chunks=5, + ranker=RRFRanker(impact_factor=50.0), # Custom impact factor +) + +# Hybrid search with weighted ranker +config = RAGQueryConfig( + mode="hybrid", + max_chunks=5, + ranker=WeightedRanker(alpha=0.7), # 70% vector, 30% keyword +) + +# Hybrid search with default RRF ranker +config = RAGQueryConfig( + mode="hybrid", max_chunks=5 +) # Will use RRF with impact_factor=60.0 +``` + +Note: The ranker configuration is only used in hybrid mode. For vector or keyword modes, the ranker parameter is ignored. + ## Installation You can install SQLite-Vec using pip: @@ -96,3 +197,5 @@ pip install sqlite-vec ## Documentation See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) for more details about sqlite-vec in general. + +[^1]: Cormack, G. V., Clarke, C. L., & Buettcher, S. (2009). [Reciprocal rank fusion outperforms condorcet and individual rank learning methods](https://dl.acm.org/doi/10.1145/1571941.1572114). In Proceedings of the 32nd international ACM SIGIR conference on Research and development in information retrieval (pp. 758-759). diff --git a/llama_stack/apis/agents/openai_responses.py b/llama_stack/apis/agents/openai_responses.py index 35b3d5ace..2e1cb257a 100644 --- a/llama_stack/apis/agents/openai_responses.py +++ b/llama_stack/apis/agents/openai_responses.py @@ -81,6 +81,15 @@ class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel): type: Literal["web_search_call"] = "web_search_call" +@json_schema_type +class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel): + id: str + queries: list[str] + status: str + type: Literal["file_search_call"] = "file_search_call" + results: list[dict[str, Any]] | None = None + + @json_schema_type class OpenAIResponseOutputMessageFunctionToolCall(BaseModel): call_id: str @@ -119,6 +128,7 @@ class OpenAIResponseOutputMessageMCPListTools(BaseModel): OpenAIResponseOutput = Annotated[ OpenAIResponseMessage | OpenAIResponseOutputMessageWebSearchToolCall + | OpenAIResponseOutputMessageFileSearchToolCall | OpenAIResponseOutputMessageFunctionToolCall | OpenAIResponseOutputMessageMCPCall | OpenAIResponseOutputMessageMCPListTools, @@ -362,6 +372,7 @@ class OpenAIResponseInputFunctionToolCallOutput(BaseModel): OpenAIResponseInput = Annotated[ # Responses API allows output messages to be passed in as input OpenAIResponseOutputMessageWebSearchToolCall + | OpenAIResponseOutputMessageFileSearchToolCall | OpenAIResponseOutputMessageFunctionToolCall | OpenAIResponseInputFunctionToolCallOutput | @@ -397,9 +408,10 @@ class FileSearchRankingOptions(BaseModel): @json_schema_type class OpenAIResponseInputToolFileSearch(BaseModel): type: Literal["file_search"] = "file_search" - vector_store_id: list[str] + vector_store_ids: list[str] + filters: dict[str, Any] | None = None + max_num_results: int | None = Field(default=10, ge=1, le=50) ranking_options: FileSearchRankingOptions | None = None - # TODO: add 
filters class ApprovalFilter(BaseModel): diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index 74697dd18..c440794f3 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -1038,6 +1038,8 @@ class InferenceProvider(Protocol): # vLLM-specific parameters guided_choice: list[str] | None = None, prompt_logprobs: int | None = None, + # for fill-in-the-middle type completion + suffix: str | None = None, ) -> OpenAICompletion: """Generate an OpenAI-compatible completion for the given prompt using the specified model. @@ -1058,6 +1060,7 @@ class InferenceProvider(Protocol): :param temperature: (Optional) The temperature to use. :param top_p: (Optional) The top p to use. :param user: (Optional) The user to use. + :param suffix: (Optional) The suffix that should be appended to the completion. :returns: An OpenAICompletion. """ ... diff --git a/llama_stack/apis/tools/rag_tool.py b/llama_stack/apis/tools/rag_tool.py index 1e3542f74..72f68b7cb 100644 --- a/llama_stack/apis/tools/rag_tool.py +++ b/llama_stack/apis/tools/rag_tool.py @@ -15,6 +15,48 @@ from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol from llama_stack.schema_utils import json_schema_type, register_schema, webmethod +@json_schema_type +class RRFRanker(BaseModel): + """ + Reciprocal Rank Fusion (RRF) ranker configuration. + + :param type: The type of ranker, always "rrf" + :param impact_factor: The impact factor for RRF scoring. Higher values give more weight to higher-ranked results. + Must be greater than 0. Default of 60 is from the original RRF paper (Cormack et al., 2009). + """ + + type: Literal["rrf"] = "rrf" + impact_factor: float = Field(default=60.0, gt=0.0) # default of 60 for optimal performance + + +@json_schema_type +class WeightedRanker(BaseModel): + """ + Weighted ranker configuration that combines vector and keyword scores. + + :param type: The type of ranker, always "weighted" + :param alpha: Weight factor between 0 and 1. + 0 means only use keyword scores, + 1 means only use vector scores, + values in between blend both scores. + """ + + type: Literal["weighted"] = "weighted" + alpha: float = Field( + default=0.5, + ge=0.0, + le=1.0, + description="Weight factor between 0 and 1. 0 means only keyword scores, 1 means only vector scores.", + ) + + +Ranker = Annotated[ + RRFRanker | WeightedRanker, + Field(discriminator="type"), +] +register_schema(Ranker, name="Ranker") + + @json_schema_type class RAGDocument(BaseModel): """ @@ -76,7 +118,8 @@ class RAGQueryConfig(BaseModel): :param chunk_template: Template for formatting each retrieved chunk in the context. Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict). Default: "Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n" - :param mode: Search mode for retrieval—either "vector" or "keyword". Default "vector". + :param mode: Search mode for retrieval—either "vector", "keyword", or "hybrid". Default "vector". + :param ranker: Configuration for the ranker to use in hybrid search. Defaults to RRF ranker. 
""" # This config defines how a query is generated using the messages @@ -86,6 +129,7 @@ class RAGQueryConfig(BaseModel): max_chunks: int = 5 chunk_template: str = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n" mode: str | None = None + ranker: Ranker | None = Field(default=None) # Only used for hybrid mode @field_validator("chunk_template") def validate_chunk_template(cls, v: str) -> str: diff --git a/llama_stack/apis/vector_io/vector_io.py b/llama_stack/apis/vector_io/vector_io.py index 1c8ae4dab..77d4cfc5a 100644 --- a/llama_stack/apis/vector_io/vector_io.py +++ b/llama_stack/apis/vector_io/vector_io.py @@ -8,7 +8,7 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Any, Literal, Protocol, runtime_checkable +from typing import Annotated, Any, Literal, Protocol, runtime_checkable from pydantic import BaseModel, Field @@ -16,6 +16,7 @@ from llama_stack.apis.inference import InterleavedContent from llama_stack.apis.vector_dbs import VectorDB from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol from llama_stack.schema_utils import json_schema_type, webmethod +from llama_stack.strong_typing.schema import register_schema class Chunk(BaseModel): @@ -133,6 +134,50 @@ class VectorStoreDeleteResponse(BaseModel): deleted: bool = True +@json_schema_type +class VectorStoreChunkingStrategyAuto(BaseModel): + type: Literal["auto"] = "auto" + + +@json_schema_type +class VectorStoreChunkingStrategyStaticConfig(BaseModel): + chunk_overlap_tokens: int = 400 + max_chunk_size_tokens: int = Field(800, ge=100, le=4096) + + +@json_schema_type +class VectorStoreChunkingStrategyStatic(BaseModel): + type: Literal["static"] = "static" + static: VectorStoreChunkingStrategyStaticConfig + + +VectorStoreChunkingStrategy = Annotated[ + VectorStoreChunkingStrategyAuto | VectorStoreChunkingStrategyStatic, Field(discriminator="type") +] +register_schema(VectorStoreChunkingStrategy, name="VectorStoreChunkingStrategy") + + +@json_schema_type +class VectorStoreFileLastError(BaseModel): + code: Literal["server_error"] | Literal["rate_limit_exceeded"] + message: str + + +@json_schema_type +class VectorStoreFileObject(BaseModel): + """OpenAI Vector Store File object.""" + + id: str + object: str = "vector_store.file" + attributes: dict[str, Any] = Field(default_factory=dict) + chunking_strategy: VectorStoreChunkingStrategy + created_at: int + last_error: VectorStoreFileLastError | None = None + status: Literal["completed"] | Literal["in_progress"] | Literal["cancelled"] | Literal["failed"] + usage_bytes: int = 0 + vector_store_id: str + + class VectorDBStore(Protocol): def get_vector_db(self, vector_db_id: str) -> VectorDB | None: ... @@ -290,3 +335,21 @@ class VectorIO(Protocol): :returns: A VectorStoreSearchResponse containing the search results. """ ... + + @webmethod(route="/openai/v1/vector_stores/{vector_store_id}/files", method="POST") + async def openai_attach_file_to_vector_store( + self, + vector_store_id: str, + file_id: str, + attributes: dict[str, Any] | None = None, + chunking_strategy: VectorStoreChunkingStrategy | None = None, + ) -> VectorStoreFileObject: + """Attach a file to a vector store. + + :param vector_store_id: The ID of the vector store to attach the file to. + :param file_id: The ID of the file to attach to the vector store. + :param attributes: The key-value attributes stored with the file, which can be used for filtering. 
+ :param chunking_strategy: The chunking strategy to use for the file. + :returns: A VectorStoreFileObject representing the attached file. + """ + ... diff --git a/llama_stack/distribution/routers/inference.py b/llama_stack/distribution/routers/inference.py index 62d04cdc4..4e0a33b59 100644 --- a/llama_stack/distribution/routers/inference.py +++ b/llama_stack/distribution/routers/inference.py @@ -426,6 +426,7 @@ class InferenceRouter(Inference): user: str | None = None, guided_choice: list[str] | None = None, prompt_logprobs: int | None = None, + suffix: str | None = None, ) -> OpenAICompletion: logger.debug( f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}", @@ -456,6 +457,7 @@ class InferenceRouter(Inference): user=user, guided_choice=guided_choice, prompt_logprobs=prompt_logprobs, + suffix=suffix, ) provider = self.routing_table.get_provider_impl(model_obj.identifier) diff --git a/llama_stack/distribution/routers/vector_io.py b/llama_stack/distribution/routers/vector_io.py index 3d65aef24..8eb56b7ca 100644 --- a/llama_stack/distribution/routers/vector_io.py +++ b/llama_stack/distribution/routers/vector_io.py @@ -19,6 +19,7 @@ from llama_stack.apis.vector_io import ( VectorStoreObject, VectorStoreSearchResponsePage, ) +from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject from llama_stack.log import get_logger from llama_stack.providers.datatypes import RoutingTable @@ -254,3 +255,20 @@ class VectorIORouter(VectorIO): ranking_options=ranking_options, rewrite_query=rewrite_query, ) + + async def openai_attach_file_to_vector_store( + self, + vector_store_id: str, + file_id: str, + attributes: dict[str, Any] | None = None, + chunking_strategy: VectorStoreChunkingStrategy | None = None, + ) -> VectorStoreFileObject: + logger.debug(f"VectorIORouter.openai_attach_file_to_vector_store: {vector_store_id}, {file_id}") + # Route based on vector store ID + provider = self.routing_table.get_provider_impl(vector_store_id) + return await provider.openai_attach_file_to_vector_store( + vector_store_id=vector_store_id, + file_id=file_id, + attributes=attributes, + chunking_strategy=chunking_strategy, + ) diff --git a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py index 0ff6dc2c5..33fcbfa5d 100644 --- a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py +++ b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py @@ -24,6 +24,7 @@ from llama_stack.apis.agents.openai_responses import ( OpenAIResponseInputMessageContentImage, OpenAIResponseInputMessageContentText, OpenAIResponseInputTool, + OpenAIResponseInputToolFileSearch, OpenAIResponseInputToolMCP, OpenAIResponseMessage, OpenAIResponseObject, @@ -34,6 +35,7 @@ from llama_stack.apis.agents.openai_responses import ( OpenAIResponseOutput, OpenAIResponseOutputMessageContent, OpenAIResponseOutputMessageContentOutputText, + OpenAIResponseOutputMessageFileSearchToolCall, OpenAIResponseOutputMessageFunctionToolCall, OpenAIResponseOutputMessageMCPListTools, OpenAIResponseOutputMessageWebSearchToolCall, @@ -62,7 +64,7 @@ from llama_stack.apis.inference.inference import ( OpenAIToolMessageParam, OpenAIUserMessageParam, ) -from llama_stack.apis.tools.tools import ToolGroups, ToolRuntime +from llama_stack.apis.tools import RAGQueryConfig, ToolGroups, ToolRuntime from llama_stack.log import get_logger from llama_stack.models.llama.datatypes import ToolDefinition, 
ToolParamDefinition from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool @@ -198,7 +200,8 @@ class OpenAIResponsePreviousResponseWithInputItems(BaseModel): class ChatCompletionContext(BaseModel): model: str messages: list[OpenAIMessageParam] - tools: list[ChatCompletionToolParam] | None = None + response_tools: list[OpenAIResponseInputTool] | None = None + chat_tools: list[ChatCompletionToolParam] | None = None mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] temperature: float | None response_format: OpenAIResponseFormatParam @@ -388,7 +391,8 @@ class OpenAIResponsesImpl: ctx = ChatCompletionContext( model=model, messages=messages, - tools=chat_tools, + response_tools=tools, + chat_tools=chat_tools, mcp_tool_to_server=mcp_tool_to_server, temperature=temperature, response_format=response_format, @@ -417,7 +421,7 @@ class OpenAIResponsesImpl: completion_result = await self.inference_api.openai_chat_completion( model=ctx.model, messages=messages, - tools=ctx.tools, + tools=ctx.chat_tools, stream=True, temperature=ctx.temperature, response_format=ctx.response_format, @@ -606,6 +610,12 @@ class OpenAIResponsesImpl: if not tool: raise ValueError(f"Tool {tool_name} not found") chat_tools.append(make_openai_tool(tool_name, tool)) + elif input_tool.type == "file_search": + tool_name = "knowledge_search" + tool = await self.tool_groups_api.get_tool(tool_name) + if not tool: + raise ValueError(f"Tool {tool_name} not found") + chat_tools.append(make_openai_tool(tool_name, tool)) elif input_tool.type == "mcp": always_allowed = None never_allowed = None @@ -667,6 +677,7 @@ class OpenAIResponsesImpl: tool_call_id = tool_call.id function = tool_call.function + tool_kwargs = json.loads(function.arguments) if function.arguments else {} if not function or not tool_call_id or not function.name: return None, None @@ -680,12 +691,26 @@ class OpenAIResponsesImpl: endpoint=mcp_tool.server_url, headers=mcp_tool.headers or {}, tool_name=function.name, - kwargs=json.loads(function.arguments) if function.arguments else {}, + kwargs=tool_kwargs, ) else: + if function.name == "knowledge_search": + response_file_search_tool = next( + t for t in ctx.response_tools if isinstance(t, OpenAIResponseInputToolFileSearch) + ) + if response_file_search_tool: + if response_file_search_tool.filters: + logger.warning("Filters are not yet supported for file_search tool") + if response_file_search_tool.ranking_options: + logger.warning("Ranking options are not yet supported for file_search tool") + tool_kwargs["vector_db_ids"] = response_file_search_tool.vector_store_ids + tool_kwargs["query_config"] = RAGQueryConfig( + mode="vector", + max_chunks=response_file_search_tool.max_num_results, + ) result = await self.tool_runtime_api.invoke_tool( tool_name=function.name, - kwargs=json.loads(function.arguments) if function.arguments else {}, + kwargs=tool_kwargs, ) except Exception as e: error_exc = e @@ -713,6 +738,27 @@ class OpenAIResponsesImpl: ) if error_exc or (result.error_code and result.error_code > 0) or result.error_message: message.status = "failed" + elif function.name == "knowledge_search": + message = OpenAIResponseOutputMessageFileSearchToolCall( + id=tool_call_id, + queries=[tool_kwargs.get("query", "")], + status="completed", + ) + if "document_ids" in result.metadata: + message.results = [] + for i, doc_id in enumerate(result.metadata["document_ids"]): + text = result.metadata["chunks"][i] if "chunks" in result.metadata else None + score = result.metadata["scores"][i] 
if "scores" in result.metadata else None + message.results.append( + { + "file_id": doc_id, + "filename": doc_id, + "text": text, + "score": score, + } + ) + if error_exc or (result.error_code and result.error_code > 0) or result.error_message: + message.status = "failed" else: raise ValueError(f"Unknown tool {function.name} called") diff --git a/llama_stack/providers/inline/tool_runtime/rag/memory.py b/llama_stack/providers/inline/tool_runtime/rag/memory.py index 4776d47d0..7f4fe5dbd 100644 --- a/llama_stack/providers/inline/tool_runtime/rag/memory.py +++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py @@ -121,8 +121,10 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti vector_db_id=vector_db_id, query=query, params={ - "max_chunks": query_config.max_chunks, "mode": query_config.mode, + "max_chunks": query_config.max_chunks, + "score_threshold": 0.0, + "ranker": query_config.ranker, }, ) for vector_db_id in vector_db_ids @@ -170,6 +172,8 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti content=picked, metadata={ "document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]], + "chunks": [c.content for c in chunks[: len(picked)]], + "scores": scores[: len(picked)], }, ) diff --git a/llama_stack/providers/inline/vector_io/faiss/__init__.py b/llama_stack/providers/inline/vector_io/faiss/__init__.py index 68a1dee66..dd1c59b7b 100644 --- a/llama_stack/providers/inline/vector_io/faiss/__init__.py +++ b/llama_stack/providers/inline/vector_io/faiss/__init__.py @@ -16,6 +16,6 @@ async def get_provider_impl(config: FaissVectorIOConfig, deps: dict[Api, Any]): assert isinstance(config, FaissVectorIOConfig), f"Unexpected config type: {type(config)}" - impl = FaissVectorIOAdapter(config, deps[Api.inference]) + impl = FaissVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files, None)) await impl.initialize() return impl diff --git a/llama_stack/providers/inline/vector_io/faiss/faiss.py b/llama_stack/providers/inline/vector_io/faiss/faiss.py index 5e9155011..a2f4417e0 100644 --- a/llama_stack/providers/inline/vector_io/faiss/faiss.py +++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py @@ -15,6 +15,7 @@ import faiss import numpy as np from numpy.typing import NDArray +from llama_stack.apis.files import Files from llama_stack.apis.inference import InterleavedContent from llama_stack.apis.inference.inference import Inference from llama_stack.apis.vector_dbs import VectorDB @@ -130,11 +131,23 @@ class FaissIndex(EmbeddingIndex): ) -> QueryChunksResponse: raise NotImplementedError("Keyword search is not supported in FAISS") + async def query_hybrid( + self, + embedding: NDArray, + query_string: str, + k: int, + score_threshold: float, + reranker_type: str, + reranker_params: dict[str, Any] | None = None, + ) -> QueryChunksResponse: + raise NotImplementedError("Hybrid search is not supported in FAISS") + class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate): - def __init__(self, config: FaissVectorIOConfig, inference_api: Inference) -> None: + def __init__(self, config: FaissVectorIOConfig, inference_api: Inference, files_api: Files | None) -> None: self.config = config self.inference_api = inference_api + self.files_api = files_api self.cache: dict[str, VectorDBWithIndex] = {} self.kvstore: KVStore | None = None self.openai_vector_stores: dict[str, dict[str, Any]] = {} diff --git a/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py 
b/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py index 6db176eda..e5200a755 100644 --- a/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +++ b/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py @@ -15,6 +15,6 @@ async def get_provider_impl(config: SQLiteVectorIOConfig, deps: dict[Api, Any]): from .sqlite_vec import SQLiteVecVectorIOAdapter assert isinstance(config, SQLiteVectorIOConfig), f"Unexpected config type: {type(config)}" - impl = SQLiteVecVectorIOAdapter(config, deps[Api.inference]) + impl = SQLiteVecVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files, None)) await impl.initialize() return impl diff --git a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py index 02f04e766..c6712882a 100644 --- a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +++ b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py @@ -17,6 +17,7 @@ import numpy as np import sqlite_vec from numpy.typing import NDArray +from llama_stack.apis.files.files import Files from llama_stack.apis.inference.inference import Inference from llama_stack.apis.vector_dbs import VectorDB from llama_stack.apis.vector_io import ( @@ -26,14 +27,20 @@ from llama_stack.apis.vector_io import ( ) from llama_stack.providers.datatypes import VectorDBsProtocolPrivate from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin -from llama_stack.providers.utils.memory.vector_store import EmbeddingIndex, VectorDBWithIndex +from llama_stack.providers.utils.memory.vector_store import ( + RERANKER_TYPE_RRF, + RERANKER_TYPE_WEIGHTED, + EmbeddingIndex, + VectorDBWithIndex, +) logger = logging.getLogger(__name__) # Specifying search mode is dependent on the VectorIO provider. 
VECTOR_SEARCH = "vector" KEYWORD_SEARCH = "keyword" -SEARCH_MODES = {VECTOR_SEARCH, KEYWORD_SEARCH} +HYBRID_SEARCH = "hybrid" +SEARCH_MODES = {VECTOR_SEARCH, KEYWORD_SEARCH, HYBRID_SEARCH} def serialize_vector(vector: list[float]) -> bytes: @@ -50,6 +57,59 @@ def _create_sqlite_connection(db_path): return connection +def _normalize_scores(scores: dict[str, float]) -> dict[str, float]: + """Normalize scores to [0,1] range using min-max normalization.""" + if not scores: + return {} + min_score = min(scores.values()) + max_score = max(scores.values()) + score_range = max_score - min_score + if score_range > 0: + return {doc_id: (score - min_score) / score_range for doc_id, score in scores.items()} + return {doc_id: 1.0 for doc_id in scores} + + +def _weighted_rerank( + vector_scores: dict[str, float], + keyword_scores: dict[str, float], + alpha: float = 0.5, +) -> dict[str, float]: + """ReRanker that uses weighted average of scores.""" + all_ids = set(vector_scores.keys()) | set(keyword_scores.keys()) + normalized_vector_scores = _normalize_scores(vector_scores) + normalized_keyword_scores = _normalize_scores(keyword_scores) + + return { + doc_id: (alpha * normalized_keyword_scores.get(doc_id, 0.0)) + + ((1 - alpha) * normalized_vector_scores.get(doc_id, 0.0)) + for doc_id in all_ids + } + + +def _rrf_rerank( + vector_scores: dict[str, float], + keyword_scores: dict[str, float], + impact_factor: float = 60.0, +) -> dict[str, float]: + """ReRanker that uses Reciprocal Rank Fusion.""" + # Convert scores to ranks + vector_ranks = { + doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(vector_scores.items(), key=lambda x: x[1], reverse=True)) + } + keyword_ranks = { + doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)) + } + + all_ids = set(vector_scores.keys()) | set(keyword_scores.keys()) + rrf_scores = {} + for doc_id in all_ids: + vector_rank = vector_ranks.get(doc_id, float("inf")) + keyword_rank = keyword_ranks.get(doc_id, float("inf")) + # RRF formula: score = 1/(k + r) where k is impact_factor and r is the rank + rrf_scores[doc_id] = (1.0 / (impact_factor + vector_rank)) + (1.0 / (impact_factor + keyword_rank)) + return rrf_scores + + class SQLiteVecIndex(EmbeddingIndex): """ An index implementation that stores embeddings in a SQLite virtual table using sqlite-vec. @@ -254,8 +314,6 @@ class SQLiteVecIndex(EmbeddingIndex): """ Performs keyword-based search using SQLite FTS5 for relevance-ranked full-text search. """ - if query_string is None: - raise ValueError("query_string is required for keyword search.") def _execute_query(): connection = _create_sqlite_connection(self.db_path) @@ -293,6 +351,81 @@ class SQLiteVecIndex(EmbeddingIndex): scores.append(score) return QueryChunksResponse(chunks=chunks, scores=scores) + async def query_hybrid( + self, + embedding: NDArray, + query_string: str, + k: int, + score_threshold: float, + reranker_type: str = RERANKER_TYPE_RRF, + reranker_params: dict[str, Any] | None = None, + ) -> QueryChunksResponse: + """ + Hybrid search using a configurable re-ranking strategy. 
+ + Args: + embedding: The query embedding vector + query_string: The text query for keyword search + k: Number of results to return + score_threshold: Minimum similarity score threshold + reranker_type: Type of reranker to use ("rrf" or "weighted") + reranker_params: Parameters for the reranker + + Returns: + QueryChunksResponse with combined results + """ + if reranker_params is None: + reranker_params = {} + + # Get results from both search methods + vector_response = await self.query_vector(embedding, k, score_threshold) + keyword_response = await self.query_keyword(query_string, k, score_threshold) + + # Convert responses to score dictionaries using generate_chunk_id + vector_scores = { + generate_chunk_id(chunk.metadata["document_id"], str(chunk.content)): score + for chunk, score in zip(vector_response.chunks, vector_response.scores, strict=False) + } + keyword_scores = { + generate_chunk_id(chunk.metadata["document_id"], str(chunk.content)): score + for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False) + } + + # Combine scores using the specified reranker + if reranker_type == RERANKER_TYPE_WEIGHTED: + alpha = reranker_params.get("alpha", 0.5) + combined_scores = _weighted_rerank(vector_scores, keyword_scores, alpha) + else: + # Default to RRF for None, RRF, or any unknown types + impact_factor = reranker_params.get("impact_factor", 60.0) + combined_scores = _rrf_rerank(vector_scores, keyword_scores, impact_factor) + + # Sort by combined score and get top k results + sorted_items = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True) + top_k_items = sorted_items[:k] + + # Filter by score threshold + filtered_items = [(doc_id, score) for doc_id, score in top_k_items if score >= score_threshold] + + # Create a map of chunk_id to chunk for both responses + chunk_map = {} + for c in vector_response.chunks: + chunk_id = generate_chunk_id(c.metadata["document_id"], str(c.content)) + chunk_map[chunk_id] = c + for c in keyword_response.chunks: + chunk_id = generate_chunk_id(c.metadata["document_id"], str(c.content)) + chunk_map[chunk_id] = c + + # Use the map to look up chunks by their IDs + chunks = [] + scores = [] + for doc_id, score in filtered_items: + if doc_id in chunk_map: + chunks.append(chunk_map[doc_id]) + scores.append(score) + + return QueryChunksResponse(chunks=chunks, scores=scores) + class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate): """ @@ -301,9 +434,10 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc and creates a cache of VectorDBWithIndex instances (each wrapping a SQLiteVecIndex). 
""" - def __init__(self, config, inference_api: Inference) -> None: + def __init__(self, config, inference_api: Inference, files_api: Files | None) -> None: self.config = config self.inference_api = inference_api + self.files_api = files_api self.cache: dict[str, VectorDBWithIndex] = {} self.openai_vector_stores: dict[str, dict[str, Any]] = {} @@ -343,7 +477,9 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc vector_db_data = row[0] vector_db = VectorDB.model_validate_json(vector_db_data) index = await SQLiteVecIndex.create( - vector_db.embedding_dimension, self.config.db_path, vector_db.identifier + vector_db.embedding_dimension, + self.config.db_path, + vector_db.identifier, ) self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api) @@ -369,7 +505,11 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc connection.close() await asyncio.to_thread(_register_db) - index = await SQLiteVecIndex.create(vector_db.embedding_dimension, self.config.db_path, vector_db.identifier) + index = await SQLiteVecIndex.create( + vector_db.embedding_dimension, + self.config.db_path, + vector_db.identifier, + ) self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api) async def list_vector_dbs(self) -> list[VectorDB]: diff --git a/llama_stack/providers/registry/vector_io.py b/llama_stack/providers/registry/vector_io.py index d888c8420..55c1b5617 100644 --- a/llama_stack/providers/registry/vector_io.py +++ b/llama_stack/providers/registry/vector_io.py @@ -24,6 +24,7 @@ def available_providers() -> list[ProviderSpec]: config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig", deprecation_warning="Please use the `inline::faiss` provider instead.", api_dependencies=[Api.inference], + optional_api_dependencies=[Api.files], ), InlineProviderSpec( api=Api.vector_io, @@ -32,6 +33,7 @@ def available_providers() -> list[ProviderSpec]: module="llama_stack.providers.inline.vector_io.faiss", config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig", api_dependencies=[Api.inference], + optional_api_dependencies=[Api.files], ), # NOTE: sqlite-vec cannot be bundled into the container image because it does not have a # source distribution and the wheels are not available for all platforms. 
@@ -42,6 +44,7 @@ def available_providers() -> list[ProviderSpec]: module="llama_stack.providers.inline.vector_io.sqlite_vec", config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig", api_dependencies=[Api.inference], + optional_api_dependencies=[Api.files], ), InlineProviderSpec( api=Api.vector_io, @@ -51,6 +54,7 @@ def available_providers() -> list[ProviderSpec]: config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig", deprecation_warning="Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.", api_dependencies=[Api.inference], + optional_api_dependencies=[Api.files], ), remote_provider_spec( Api.vector_io, diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index 75a9e33e2..79b1b5f08 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -318,6 +318,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv user: str | None = None, guided_choice: list[str] | None = None, prompt_logprobs: int | None = None, + suffix: str | None = None, ) -> OpenAICompletion: model_obj = await self.model_store.get_model(model) diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py index 4c68322e0..cb6c6e279 100644 --- a/llama_stack/providers/remote/inference/nvidia/nvidia.py +++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py @@ -316,6 +316,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper): user: str | None = None, guided_choice: list[str] | None = None, prompt_logprobs: int | None = None, + suffix: str | None = None, ) -> OpenAICompletion: provider_model_id = await self._get_provider_model_id(model) diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index 358a29d4c..d51072fbf 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -33,7 +33,6 @@ from llama_stack.apis.inference import ( JsonSchemaResponseFormat, LogProbConfig, Message, - OpenAIEmbeddingsResponse, ResponseFormat, SamplingParams, TextTruncation, @@ -46,6 +45,8 @@ from llama_stack.apis.inference.inference import ( OpenAIChatCompletion, OpenAIChatCompletionChunk, OpenAICompletion, + OpenAIEmbeddingsResponse, + OpenAIEmbeddingUsage, OpenAIMessageParam, OpenAIResponseFormatParam, ) @@ -62,8 +63,10 @@ from llama_stack.providers.utils.inference.model_registry import ( from llama_stack.providers.utils.inference.openai_compat import ( OpenAICompatCompletionChoice, OpenAICompatCompletionResponse, + b64_encode_openai_embeddings_response, get_sampling_options, prepare_openai_completion_params, + prepare_openai_embeddings_params, process_chat_completion_response, process_chat_completion_stream_response, process_completion_response, @@ -386,7 +389,35 @@ class OllamaInferenceAdapter( dimensions: int | None = None, user: str | None = None, ) -> OpenAIEmbeddingsResponse: - raise NotImplementedError() + model_obj = await self._get_model(model) + if model_obj.model_type != ModelType.embedding: + raise ValueError(f"Model {model} is not an embedding model") + + if model_obj.provider_resource_id is None: + raise ValueError(f"Model {model} has no provider_resource_id set") + + # Note, at the moment Ollama does not 
support encoding_format, dimensions, and user parameters + params = prepare_openai_embeddings_params( + model=model_obj.provider_resource_id, + input=input, + encoding_format=encoding_format, + dimensions=dimensions, + user=user, + ) + + response = await self.openai_client.embeddings.create(**params) + data = b64_encode_openai_embeddings_response(response.data, encoding_format) + + usage = OpenAIEmbeddingUsage( + prompt_tokens=response.usage.prompt_tokens, + total_tokens=response.usage.total_tokens, + ) + # TODO: Investigate why model_obj.identifier is used instead of response.model + return OpenAIEmbeddingsResponse( + data=data, + model=model_obj.identifier, + usage=usage, + ) async def openai_completion( self, @@ -409,6 +440,7 @@ class OllamaInferenceAdapter( user: str | None = None, guided_choice: list[str] | None = None, prompt_logprobs: int | None = None, + suffix: str | None = None, ) -> OpenAICompletion: if not isinstance(prompt, str): raise ValueError("Ollama does not support non-string prompts for completion") @@ -432,6 +464,7 @@ class OllamaInferenceAdapter( temperature=temperature, top_p=top_p, user=user, + suffix=suffix, ) return await self.openai_client.completions.create(**params) # type: ignore diff --git a/llama_stack/providers/remote/inference/openai/openai.py b/llama_stack/providers/remote/inference/openai/openai.py index 6f3a686a8..ed4ec22aa 100644 --- a/llama_stack/providers/remote/inference/openai/openai.py +++ b/llama_stack/providers/remote/inference/openai/openai.py @@ -90,6 +90,7 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin): user: str | None = None, guided_choice: list[str] | None = None, prompt_logprobs: int | None = None, + suffix: str | None = None, ) -> OpenAICompletion: if guided_choice is not None: logging.warning("guided_choice is not supported by the OpenAI API. 
Ignoring.") @@ -117,6 +118,7 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin): temperature=temperature, top_p=top_p, user=user, + suffix=suffix, ) return await self._openai_client.completions.create(**params) diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py index 6cf4680e2..e9660abb9 100644 --- a/llama_stack/providers/remote/inference/passthrough/passthrough.py +++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py @@ -242,6 +242,7 @@ class PassthroughInferenceAdapter(Inference): user: str | None = None, guided_choice: list[str] | None = None, prompt_logprobs: int | None = None, + suffix: str | None = None, ) -> OpenAICompletion: client = self._get_client() model_obj = await self.model_store.get_model(model) diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py index 7305a638d..7030a644d 100644 --- a/llama_stack/providers/remote/inference/together/together.py +++ b/llama_stack/providers/remote/inference/together/together.py @@ -299,6 +299,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi user: str | None = None, guided_choice: list[str] | None = None, prompt_logprobs: int | None = None, + suffix: str | None = None, ) -> OpenAICompletion: model_obj = await self.model_store.get_model(model) params = await prepare_openai_completion_params( diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index d0a822f3c..16d133c81 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -559,6 +559,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): user: str | None = None, guided_choice: list[str] | None = None, prompt_logprobs: int | None = None, + suffix: str | None = None, ) -> OpenAICompletion: self._lazy_initialize_client() model_obj = await self._get_model(model) diff --git a/llama_stack/providers/remote/inference/watsonx/watsonx.py b/llama_stack/providers/remote/inference/watsonx/watsonx.py index 3ec5fce66..eedeb7baf 100644 --- a/llama_stack/providers/remote/inference/watsonx/watsonx.py +++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py @@ -313,6 +313,7 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper): user: str | None = None, guided_choice: list[str] | None = None, prompt_logprobs: int | None = None, + suffix: str | None = None, ) -> OpenAICompletion: model_obj = await self.model_store.get_model(model) params = await prepare_openai_completion_params( diff --git a/llama_stack/providers/remote/vector_io/chroma/chroma.py b/llama_stack/providers/remote/vector_io/chroma/chroma.py index 0d8451eb2..027cdcb11 100644 --- a/llama_stack/providers/remote/vector_io/chroma/chroma.py +++ b/llama_stack/providers/remote/vector_io/chroma/chroma.py @@ -23,6 +23,7 @@ from llama_stack.apis.vector_io import ( VectorStoreObject, VectorStoreSearchResponsePage, ) +from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate from llama_stack.providers.inline.vector_io.chroma import ChromaVectorIOConfig as InlineChromaVectorIOConfig from llama_stack.providers.utils.memory.vector_store import ( @@ -104,6 +105,17 @@ class ChromaIndex(EmbeddingIndex): ) -> QueryChunksResponse: raise 
NotImplementedError("Keyword search is not supported in Chroma") + async def query_hybrid( + self, + embedding: NDArray, + query_string: str, + k: int, + score_threshold: float, + reranker_type: str, + reranker_params: dict[str, Any] | None = None, + ) -> QueryChunksResponse: + raise NotImplementedError("Hybrid search is not supported in Chroma") + class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate): def __init__( @@ -241,3 +253,12 @@ class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate): rewrite_query: bool | None = False, ) -> VectorStoreSearchResponsePage: raise NotImplementedError("OpenAI Vector Stores API is not supported in Chroma") + + async def openai_attach_file_to_vector_store( + self, + vector_store_id: str, + file_id: str, + attributes: dict[str, Any] | None = None, + chunking_strategy: VectorStoreChunkingStrategy | None = None, + ) -> VectorStoreFileObject: + raise NotImplementedError("OpenAI Vector Stores API is not supported in Chroma") diff --git a/llama_stack/providers/remote/vector_io/milvus/milvus.py b/llama_stack/providers/remote/vector_io/milvus/milvus.py index 8ae74aedc..42ab4fa3e 100644 --- a/llama_stack/providers/remote/vector_io/milvus/milvus.py +++ b/llama_stack/providers/remote/vector_io/milvus/milvus.py @@ -25,6 +25,7 @@ from llama_stack.apis.vector_io import ( VectorStoreObject, VectorStoreSearchResponsePage, ) +from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate from llama_stack.providers.inline.vector_io.milvus import MilvusVectorIOConfig as InlineMilvusVectorIOConfig from llama_stack.providers.utils.memory.vector_store import ( @@ -102,6 +103,17 @@ class MilvusIndex(EmbeddingIndex): ) -> QueryChunksResponse: raise NotImplementedError("Keyword search is not supported in Milvus") + async def query_hybrid( + self, + embedding: NDArray, + query_string: str, + k: int, + score_threshold: float, + reranker_type: str, + reranker_params: dict[str, Any] | None = None, + ) -> QueryChunksResponse: + raise NotImplementedError("Hybrid search is not supported in Milvus") + class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate): def __init__( @@ -240,6 +252,15 @@ class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate): ) -> VectorStoreSearchResponsePage: raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant") + async def openai_attach_file_to_vector_store( + self, + vector_store_id: str, + file_id: str, + attributes: dict[str, Any] | None = None, + chunking_strategy: VectorStoreChunkingStrategy | None = None, + ) -> VectorStoreFileObject: + raise NotImplementedError("OpenAI Vector Stores API is not supported in Milvus") + def generate_chunk_id(document_id: str, chunk_text: str) -> str: """Generate a unique chunk ID using a hash of document ID and chunk text.""" diff --git a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py index 7d58a49f3..1917af086 100644 --- a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py +++ b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py @@ -128,6 +128,17 @@ class PGVectorIndex(EmbeddingIndex): ) -> QueryChunksResponse: raise NotImplementedError("Keyword search is not supported in PGVector") + async def query_hybrid( + self, + embedding: NDArray, + query_string: str, + k: int, + score_threshold: float, + reranker_type: str, + reranker_params: dict[str, Any] | None = None, 
+ ) -> QueryChunksResponse: + raise NotImplementedError("Hybrid search is not supported in PGVector") + async def delete(self): with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: cur.execute(f"DROP TABLE IF EXISTS {self.table_name}") diff --git a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py index 10f3b5b0d..fa7782f04 100644 --- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py +++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py @@ -23,6 +23,7 @@ from llama_stack.apis.vector_io import ( VectorStoreObject, VectorStoreSearchResponsePage, ) +from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate from llama_stack.providers.inline.vector_io.qdrant import QdrantVectorIOConfig as InlineQdrantVectorIOConfig from llama_stack.providers.utils.memory.vector_store import ( @@ -111,6 +112,17 @@ class QdrantIndex(EmbeddingIndex): ) -> QueryChunksResponse: raise NotImplementedError("Keyword search is not supported in Qdrant") + async def query_hybrid( + self, + embedding: NDArray, + query_string: str, + k: int, + score_threshold: float, + reranker_type: str, + reranker_params: dict[str, Any] | None = None, + ) -> QueryChunksResponse: + raise NotImplementedError("Hybrid search is not supported in Qdrant") + async def delete(self): await self.client.delete_collection(collection_name=self.collection_name) @@ -241,3 +253,12 @@ class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate): rewrite_query: bool | None = False, ) -> VectorStoreSearchResponsePage: raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant") + + async def openai_attach_file_to_vector_store( + self, + vector_store_id: str, + file_id: str, + attributes: dict[str, Any] | None = None, + chunking_strategy: VectorStoreChunkingStrategy | None = None, + ) -> VectorStoreFileObject: + raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant") diff --git a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py index 6f2027dad..c63dd70c6 100644 --- a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py +++ b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py @@ -92,6 +92,17 @@ class WeaviateIndex(EmbeddingIndex): ) -> QueryChunksResponse: raise NotImplementedError("Keyword search is not supported in Weaviate") + async def query_hybrid( + self, + embedding: NDArray, + query_string: str, + k: int, + score_threshold: float, + reranker_type: str, + reranker_params: dict[str, Any] | None = None, + ) -> QueryChunksResponse: + raise NotImplementedError("Hybrid search is not supported in Weaviate") + class WeaviateVectorIOAdapter( VectorIO, diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py index dab10bc55..c21f379c9 100644 --- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py +++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py @@ -4,8 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
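# The query_hybrid stubs above thread through a reranker_type ("rrf" or "weighted")
# and reranker_params without implementing them. As a rough sketch of what those two
# combination strategies compute (an assumption here, pinned down only by the
# sqlite-vec unit tests later in this diff, where a rank-1 hit in both result lists
# scores 2 / (impact_factor + 1)):

def rrf_score(ranks: list[int], impact_factor: float = 60.0) -> float:
    # Reciprocal Rank Fusion: sum 1 / (impact_factor + rank) over the keyword and
    # vector result lists in which the chunk appears; ranks are 1-based.
    return sum(1.0 / (impact_factor + rank) for rank in ranks)

def weighted_score(keyword_score: float, vector_score: float, alpha: float = 0.5) -> float:
    # Weighted blend of the normalized keyword score and the vector score;
    # alpha=1.0 behaves like pure keyword search, alpha=0.0 like pure vector search.
    return alpha * keyword_score + (1.0 - alpha) * vector_score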
-import base64 -import struct from collections.abc import AsyncGenerator, AsyncIterator from typing import Any @@ -37,7 +35,6 @@ from llama_stack.apis.inference.inference import ( OpenAIChatCompletion, OpenAIChatCompletionChunk, OpenAICompletion, - OpenAIEmbeddingData, OpenAIEmbeddingsResponse, OpenAIEmbeddingUsage, OpenAIMessageParam, @@ -48,6 +45,7 @@ from llama_stack.distribution.request_headers import NeedsRequestProviderData from llama_stack.log import get_logger from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper from llama_stack.providers.utils.inference.openai_compat import ( + b64_encode_openai_embeddings_response, convert_message_to_openai_dict_new, convert_openai_chat_completion_choice, convert_openai_chat_completion_stream, @@ -293,16 +291,7 @@ class LiteLLMOpenAIMixin( ) # Convert response to OpenAI format - data = [] - for i, embedding_data in enumerate(response["data"]): - # we encode to base64 if the encoding format is base64 in the request - if encoding_format == "base64": - byte_data = b"".join(struct.pack("f", f) for f in embedding_data["embedding"]) - embedding = base64.b64encode(byte_data).decode("utf-8") - else: - embedding = embedding_data["embedding"] - - data.append(OpenAIEmbeddingData(embedding=embedding, index=i)) + data = b64_encode_openai_embeddings_response(response.data, encoding_format) usage = OpenAIEmbeddingUsage( prompt_tokens=response["usage"]["prompt_tokens"], @@ -336,6 +325,7 @@ class LiteLLMOpenAIMixin( user: str | None = None, guided_choice: list[str] | None = None, prompt_logprobs: int | None = None, + suffix: str | None = None, ) -> OpenAICompletion: model_obj = await self.model_store.get_model(model) params = await prepare_openai_completion_params( diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py index 049f06fdb..ff95b12a7 100644 --- a/llama_stack/providers/utils/inference/openai_compat.py +++ b/llama_stack/providers/utils/inference/openai_compat.py @@ -3,8 +3,10 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
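# The change above drops the inline base64 handling from the LiteLLM mixin in favor
# of the shared b64_encode_openai_embeddings_response helper added further down in
# openai_compat.py. A minimal round-trip sketch of that encoding convention (each
# embedding value packed as a 4-byte float32, then base64-encoded); the numpy-based
# decode side is an illustrative assumption, not part of this change:

import base64
import struct

import numpy as np

def encode_embedding(values: list[float]) -> str:
    # Pack the floats as float32 and base64-encode the resulting buffer.
    return base64.b64encode(b"".join(struct.pack("f", float(v)) for v in values)).decode("utf-8")

def decode_embedding(encoded: str) -> list[float]:
    # Reverse the encoding; clients commonly use numpy's frombuffer for this.
    return np.frombuffer(base64.b64decode(encoded), dtype=np.float32).tolist()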
+import base64 import json import logging +import struct import time import uuid import warnings @@ -108,6 +110,7 @@ from llama_stack.apis.inference.inference import ( OpenAIChatCompletion, OpenAICompletion, OpenAICompletionChoice, + OpenAIEmbeddingData, OpenAIMessageParam, OpenAIResponseFormatParam, ToolConfig, @@ -1287,6 +1290,7 @@ class OpenAICompletionToLlamaStackMixin: user: str | None = None, guided_choice: list[str] | None = None, prompt_logprobs: int | None = None, + suffix: str | None = None, ) -> OpenAICompletion: if stream: raise ValueError(f"{self.__class__.__name__} doesn't support streaming openai completions") @@ -1483,3 +1487,55 @@ class OpenAIChatCompletionToLlamaStackMixin: model=model, object="chat.completion", ) + + +def prepare_openai_embeddings_params( + model: str, + input: str | list[str], + encoding_format: str | None = "float", + dimensions: int | None = None, + user: str | None = None, +): + if model is None: + raise ValueError("Model must be provided for embeddings") + + input_list = [input] if isinstance(input, str) else input + + params: dict[str, Any] = { + "model": model, + "input": input_list, + } + + if encoding_format is not None: + params["encoding_format"] = encoding_format + if dimensions is not None: + params["dimensions"] = dimensions + if user is not None: + params["user"] = user + + return params + + +def b64_encode_openai_embeddings_response( + response_data: dict, encoding_format: str | None = "float" +) -> list[OpenAIEmbeddingData]: + """ + Process the OpenAI embeddings response to encode the embeddings in base64 format if specified. + """ + data = [] + for i, embedding_data in enumerate(response_data): + if encoding_format == "base64": + byte_array = bytearray() + for embedding_value in embedding_data.embedding: + byte_array.extend(struct.pack("f", float(embedding_value))) + + response_embedding = base64.b64encode(byte_array).decode("utf-8") + else: + response_embedding = embedding_data.embedding + data.append( + OpenAIEmbeddingData( + embedding=response_embedding, + index=i, + ) + ) + return data diff --git a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py index 7d8163ed2..f9701897a 100644 --- a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py +++ b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py @@ -5,11 +5,13 @@ # the root directory of this source tree. 
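# The mixin below supplies the server-side behavior behind the new
# openai_attach_file_to_vector_store endpoint. A small client-side sketch of the
# intended flow (create a store, upload a file, attach it, and poll until ingestion
# finishes); the base_url, api_key, and file name are placeholder assumptions:

import time

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

vector_store = client.vector_stores.create(name="my_docs")
uploaded = client.files.create(file=open("my_document.pdf", "rb"), purpose="assistants")

attached = client.vector_stores.files.create(vector_store_id=vector_store.id, file_id=uploaded.id)
while attached.status == "in_progress":
    time.sleep(0.1)
    attached = client.vector_stores.files.retrieve(vector_store_id=vector_store.id, file_id=uploaded.id)
assert attached.status == "completed", attached.last_error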
import logging +import mimetypes import time import uuid from abc import ABC, abstractmethod from typing import Any +from llama_stack.apis.files import Files from llama_stack.apis.vector_dbs import VectorDB from llama_stack.apis.vector_io import ( QueryChunksResponse, @@ -20,6 +22,15 @@ from llama_stack.apis.vector_io import ( VectorStoreSearchResponse, VectorStoreSearchResponsePage, ) +from llama_stack.apis.vector_io.vector_io import ( + Chunk, + VectorStoreChunkingStrategy, + VectorStoreChunkingStrategyAuto, + VectorStoreChunkingStrategyStatic, + VectorStoreFileLastError, + VectorStoreFileObject, +) +from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, make_overlapped_chunks logger = logging.getLogger(__name__) @@ -36,6 +47,7 @@ class OpenAIVectorStoreMixin(ABC): # These should be provided by the implementing class openai_vector_stores: dict[str, dict[str, Any]] + files_api: Files | None @abstractmethod async def _save_openai_vector_store(self, store_id: str, store_info: dict[str, Any]) -> None: @@ -67,6 +79,16 @@ class OpenAIVectorStoreMixin(ABC): """Unregister a vector database (provider-specific implementation).""" pass + @abstractmethod + async def insert_chunks( + self, + vector_db_id: str, + chunks: list[Chunk], + ttl_seconds: int | None = None, + ) -> None: + """Insert chunks into a vector database (provider-specific implementation).""" + pass + @abstractmethod async def query_chunks( self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None @@ -383,3 +405,78 @@ class OpenAIVectorStoreMixin(ABC): if metadata[key] != value: return False return True + + async def openai_attach_file_to_vector_store( + self, + vector_store_id: str, + file_id: str, + attributes: dict[str, Any] | None = None, + chunking_strategy: VectorStoreChunkingStrategy | None = None, + ) -> VectorStoreFileObject: + attributes = attributes or {} + chunking_strategy = chunking_strategy or VectorStoreChunkingStrategyAuto() + + vector_store_file_object = VectorStoreFileObject( + id=file_id, + attributes=attributes, + chunking_strategy=chunking_strategy, + created_at=int(time.time()), + status="in_progress", + vector_store_id=vector_store_id, + ) + + if not hasattr(self, "files_api") or not self.files_api: + vector_store_file_object.status = "failed" + vector_store_file_object.last_error = VectorStoreFileLastError( + code="server_error", + message="Files API is not available", + ) + return vector_store_file_object + + if isinstance(chunking_strategy, VectorStoreChunkingStrategyStatic): + max_chunk_size_tokens = chunking_strategy.static.max_chunk_size_tokens + chunk_overlap_tokens = chunking_strategy.static.chunk_overlap_tokens + else: + # Default values from OpenAI API spec + max_chunk_size_tokens = 800 + chunk_overlap_tokens = 400 + + try: + file_response = await self.files_api.openai_retrieve_file(file_id) + mime_type, _ = mimetypes.guess_type(file_response.filename) + content_response = await self.files_api.openai_retrieve_file_content(file_id) + + content = content_from_data_and_mime_type(content_response.body, mime_type) + + chunks = make_overlapped_chunks( + file_id, + content, + max_chunk_size_tokens, + chunk_overlap_tokens, + attributes, + ) + + if not chunks: + vector_store_file_object.status = "failed" + vector_store_file_object.last_error = VectorStoreFileLastError( + code="server_error", + message="No chunks were generated from the file", + ) + return vector_store_file_object + + await self.insert_chunks( + vector_db_id=vector_store_id, + chunks=chunks, 
+ ) + except Exception as e: + logger.error(f"Error attaching file to vector store: {e}") + vector_store_file_object.status = "failed" + vector_store_file_object.last_error = VectorStoreFileLastError( + code="server_error", + message=str(e), + ) + return vector_store_file_object + + vector_store_file_object.status = "completed" + + return vector_store_file_object diff --git a/llama_stack/providers/utils/memory/vector_store.py b/llama_stack/providers/utils/memory/vector_store.py index 4cd15860b..a6e420feb 100644 --- a/llama_stack/providers/utils/memory/vector_store.py +++ b/llama_stack/providers/utils/memory/vector_store.py @@ -32,6 +32,10 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( log = logging.getLogger(__name__) +# Constants for reranker types +RERANKER_TYPE_RRF = "rrf" +RERANKER_TYPE_WEIGHTED = "weighted" + def parse_pdf(data: bytes) -> str: # For PDF and DOC/DOCX files, we can't reliably convert to string @@ -72,16 +76,18 @@ def content_from_data(data_url: str) -> str: data = unquote(data) encoding = parts["encoding"] or "utf-8" data = data.encode(encoding) + return content_from_data_and_mime_type(data, parts["mimetype"], parts.get("encoding", None)) - encoding = parts["encoding"] - if not encoding: - import chardet - detected = chardet.detect(data) - encoding = detected["encoding"] +def content_from_data_and_mime_type(data: bytes | str, mime_type: str | None, encoding: str | None = None) -> str: + if isinstance(data, bytes): + if not encoding: + import chardet - mime_type = parts["mimetype"] - mime_category = mime_type.split("/")[0] + detected = chardet.detect(data) + encoding = detected["encoding"] + + mime_category = mime_type.split("/")[0] if mime_type else None if mime_category == "text": # For text-based files (including CSV, MD) return data.decode(encoding) @@ -200,6 +206,18 @@ class EmbeddingIndex(ABC): async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse: raise NotImplementedError() + @abstractmethod + async def query_hybrid( + self, + embedding: NDArray, + query_string: str, + k: int, + score_threshold: float, + reranker_type: str, + reranker_params: dict[str, Any] | None = None, + ) -> QueryChunksResponse: + raise NotImplementedError() + @abstractmethod async def delete(self): raise NotImplementedError() @@ -243,10 +261,29 @@ class VectorDBWithIndex: k = params.get("max_chunks", 3) mode = params.get("mode") score_threshold = params.get("score_threshold", 0.0) + + # Get ranker configuration + ranker = params.get("ranker") + if ranker is None: + # Default to RRF with impact_factor=60.0 + reranker_type = RERANKER_TYPE_RRF + reranker_params = {"impact_factor": 60.0} + else: + reranker_type = ranker.type + reranker_params = ( + {"impact_factor": ranker.impact_factor} if ranker.type == RERANKER_TYPE_RRF else {"alpha": ranker.alpha} + ) + query_string = interleaved_content_as_str(query) if mode == "keyword": return await self.index.query_keyword(query_string, k, score_threshold) + + # Calculate embeddings for both vector and hybrid modes + embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string]) + query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32) + if mode == "hybrid": + return await self.index.query_hybrid( + query_vector, query_string, k, score_threshold, reranker_type, reranker_params + ) else: - embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string]) - query_vector = 
np.array(embeddings_response.embeddings[0], dtype=np.float32) return await self.index.query_vector(query_vector, k, score_threshold) diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml index 36a120897..ebe0849f3 100644 --- a/llama_stack/templates/ollama/build.yaml +++ b/llama_stack/templates/ollama/build.yaml @@ -23,6 +23,8 @@ distribution_spec: - inline::basic - inline::llm-as-judge - inline::braintrust + files: + - inline::localfs post_training: - inline::huggingface tool_runtime: diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py index 0b4f05128..46c4852a4 100644 --- a/llama_stack/templates/ollama/ollama.py +++ b/llama_stack/templates/ollama/ollama.py @@ -13,6 +13,7 @@ from llama_stack.distribution.datatypes import ( ShieldInput, ToolGroupInput, ) +from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig from llama_stack.providers.inline.post_training.huggingface import HuggingFacePostTrainingConfig from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig from llama_stack.providers.remote.inference.ollama import OllamaImplConfig @@ -29,6 +30,7 @@ def get_distribution_template() -> DistributionTemplate: "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], + "files": ["inline::localfs"], "post_training": ["inline::huggingface"], "tool_runtime": [ "remote::brave-search", @@ -49,6 +51,11 @@ def get_distribution_template() -> DistributionTemplate: provider_type="inline::faiss", config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), ) + files_provider = Provider( + provider_id="meta-reference-files", + provider_type="inline::localfs", + config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"), + ) posttraining_provider = Provider( provider_id="huggingface", provider_type="inline::huggingface", @@ -98,6 +105,7 @@ def get_distribution_template() -> DistributionTemplate: provider_overrides={ "inference": [inference_provider], "vector_io": [vector_io_provider_faiss], + "files": [files_provider], "post_training": [posttraining_provider], }, default_models=[inference_model, embedding_model], @@ -107,6 +115,7 @@ def get_distribution_template() -> DistributionTemplate: provider_overrides={ "inference": [inference_provider], "vector_io": [vector_io_provider_faiss], + "files": [files_provider], "post_training": [posttraining_provider], "safety": [ Provider( diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index 7bf9fc3bd..85d5c813b 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -4,6 +4,7 @@ apis: - agents - datasetio - eval +- files - inference - post_training - safety @@ -84,6 +85,14 @@ providers: provider_type: inline::braintrust config: openai_api_key: ${env.OPENAI_API_KEY:} + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/ollama/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/files_metadata.db post_training: - provider_id: huggingface provider_type: inline::huggingface diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index 0030bcd60..2d10a99a4 100644 --- 
a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -4,6 +4,7 @@ apis: - agents - datasetio - eval +- files - inference - post_training - safety @@ -82,6 +83,14 @@ providers: provider_type: inline::braintrust config: openai_api_key: ${env.OPENAI_API_KEY:} + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/ollama/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/files_metadata.db post_training: - provider_id: huggingface provider_type: inline::huggingface diff --git a/llama_stack/templates/starter/build.yaml b/llama_stack/templates/starter/build.yaml index 5fd3cc3f5..9bf4913a7 100644 --- a/llama_stack/templates/starter/build.yaml +++ b/llama_stack/templates/starter/build.yaml @@ -17,6 +17,8 @@ distribution_spec: - inline::sqlite-vec - remote::chromadb - remote::pgvector + files: + - inline::localfs safety: - inline::llama-guard agents: diff --git a/llama_stack/templates/starter/run.yaml b/llama_stack/templates/starter/run.yaml index 4732afa77..319ababe5 100644 --- a/llama_stack/templates/starter/run.yaml +++ b/llama_stack/templates/starter/run.yaml @@ -4,6 +4,7 @@ apis: - agents - datasetio - eval +- files - inference - safety - scoring @@ -75,6 +76,14 @@ providers: db: ${env.PGVECTOR_DB:} user: ${env.PGVECTOR_USER:} password: ${env.PGVECTOR_PASSWORD:} + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/starter/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/files_metadata.db safety: - provider_id: llama-guard provider_type: inline::llama-guard diff --git a/llama_stack/templates/starter/starter.py b/llama_stack/templates/starter/starter.py index 650ecc87f..2a44a0a37 100644 --- a/llama_stack/templates/starter/starter.py +++ b/llama_stack/templates/starter/starter.py @@ -12,6 +12,7 @@ from llama_stack.distribution.datatypes import ( ShieldInput, ToolGroupInput, ) +from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) @@ -134,6 +135,7 @@ def get_distribution_template() -> DistributionTemplate: providers = { "inference": ([p.provider_type for p in inference_providers] + ["inline::sentence-transformers"]), "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"], + "files": ["inline::localfs"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], @@ -170,6 +172,11 @@ def get_distribution_template() -> DistributionTemplate: ), ), ] + files_provider = Provider( + provider_id="meta-reference-files", + provider_type="inline::localfs", + config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"), + ) embedding_provider = Provider( provider_id="sentence-transformers", provider_type="inline::sentence-transformers", @@ -212,6 +219,7 @@ def get_distribution_template() -> DistributionTemplate: provider_overrides={ "inference": inference_providers + [embedding_provider], "vector_io": vector_io_providers, + "files": [files_provider], }, default_models=default_models + [embedding_model], default_tool_groups=default_tool_groups, diff --git a/tests/integration/inference/test_openai_completion.py 
b/tests/integration/inference/test_openai_completion.py index 461527d18..3e43af272 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -22,9 +22,6 @@ def provider_from_model(client_with_models, model_id): def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id): - if isinstance(client_with_models, LlamaStackAsLibraryClient): - pytest.skip("OpenAI completions are not supported when testing with library client yet.") - provider = provider_from_model(client_with_models, model_id) if provider.provider_type in ( "inline::meta-reference", @@ -44,6 +41,23 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id) pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.") +def skip_if_model_doesnt_support_suffix(client_with_models, model_id): + # To test `fim` ( fill in the middle ) completion, we need to use a model that supports suffix. + # Use this to specifically test this API functionality. + + # pytest -sv --stack-config="inference=ollama" \ + # tests/integration/inference/test_openai_completion.py \ + # --text-model qwen2.5-coder:1.5b \ + # -k test_openai_completion_non_streaming_suffix + + if model_id != "qwen2.5-coder:1.5b": + pytest.skip(f"Suffix is not supported for the model: {model_id}.") + + provider = provider_from_model(client_with_models, model_id) + if provider.provider_type != "remote::ollama": + pytest.skip(f"Provider {provider.provider_type} doesn't support suffix.") + + def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id): if isinstance(client_with_models, LlamaStackAsLibraryClient): pytest.skip("OpenAI chat completions are not supported when testing with library client yet.") @@ -102,6 +116,32 @@ def test_openai_completion_non_streaming(llama_stack_client, client_with_models, assert len(choice.text) > 10 +@pytest.mark.parametrize( + "test_case", + [ + "inference:completion:suffix", + ], +) +def test_openai_completion_non_streaming_suffix(llama_stack_client, client_with_models, text_model_id, test_case): + skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id) + skip_if_model_doesnt_support_suffix(client_with_models, text_model_id) + tc = TestCase(test_case) + + # ollama needs more verbose prompting for some reason here... 
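    # Fill-in-the-middle: the "suffix" test case pairs the prompt "The capital of "
    # with the suffix "is Paris.", so the assertions below expect "france" somewhere
    # in the generated completion text.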
+ response = llama_stack_client.completions.create( + model=text_model_id, + prompt=tc["content"], + stream=False, + suffix=tc["suffix"], + max_tokens=10, + ) + + assert len(response.choices) > 0 + choice = response.choices[0] + assert len(choice.text) > 5 + assert "france" in choice.text.lower() + + @pytest.mark.parametrize( "test_case", [ diff --git a/tests/integration/inference/test_openai_embeddings.py b/tests/integration/inference/test_openai_embeddings.py index 90a91a206..1b8bd9038 100644 --- a/tests/integration/inference/test_openai_embeddings.py +++ b/tests/integration/inference/test_openai_embeddings.py @@ -51,7 +51,6 @@ def skip_if_model_doesnt_support_openai_embeddings(client, model_id): "remote::runpod", "remote::sambanova", "remote::tgi", - "remote::ollama", ): pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI embeddings.") diff --git a/tests/integration/test_cases/inference/completion.json b/tests/integration/test_cases/inference/completion.json index 731ceddbc..baaecb375 100644 --- a/tests/integration/test_cases/inference/completion.json +++ b/tests/integration/test_cases/inference/completion.json @@ -4,6 +4,12 @@ "content": "Complete the sentence using one word: Roses are red, violets are " } }, + "suffix": { + "data": { + "content": "The capital of ", + "suffix": "is Paris." + } + }, "non_streaming": { "data": { "content": "Micheael Jordan is born in ", diff --git a/tests/unit/providers/vector_io/test_sqlite_vec.py b/tests/unit/providers/vector_io/test_sqlite_vec.py index 010a0ca42..6424b9e86 100644 --- a/tests/unit/providers/vector_io/test_sqlite_vec.py +++ b/tests/unit/providers/vector_io/test_sqlite_vec.py @@ -84,6 +84,28 @@ async def test_query_chunks_full_text_search(sqlite_vec_index, sample_chunks, sa assert len(response_no_results.chunks) == 0, f"Expected 0 results, but got {len(response_no_results.chunks)}" +@pytest.mark.asyncio +async def test_query_chunks_hybrid(sqlite_vec_index, sample_chunks, sample_embeddings): + await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) + + # Create a query embedding that's similar to the first chunk + query_embedding = sample_embeddings[0] + query_string = "Sentence 5" + + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=3, + score_threshold=0.0, + reranker_type="rrf", + reranker_params={"impact_factor": 60.0}, + ) + + assert len(response.chunks) == 3, f"Expected 3 results, got {len(response.chunks)}" + # Verify scores are in descending order (higher is better) + assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1)) + + @pytest.mark.asyncio async def test_query_chunks_full_text_search_k_greater_than_results(sqlite_vec_index, sample_chunks, sample_embeddings): # Re-initialize with a clean index @@ -141,3 +163,355 @@ def test_generate_chunk_id(): "bc744db3-1b25-0a9c-cdff-b6ba3df73c36", "f68df25d-d9aa-ab4d-5684-64a233add20d", ] + + +@pytest.mark.asyncio +async def test_query_chunks_hybrid_no_keyword_matches(sqlite_vec_index, sample_chunks, sample_embeddings): + """Test hybrid search when keyword search returns no matches - should still return vector results.""" + await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) + + # Use a non-existent keyword but a valid vector query + query_embedding = sample_embeddings[0] + query_string = "Sentence 499" + + # First verify keyword search returns no results + keyword_response = await 
sqlite_vec_index.query_keyword(query_string, k=5, score_threshold=0.0) + assert len(keyword_response.chunks) == 0, "Keyword search should return no results" + + # Get hybrid results + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=3, + score_threshold=0.0, + reranker_type="rrf", + reranker_params={"impact_factor": 60.0}, + ) + + # Should still get results from vector search + assert len(response.chunks) > 0, "Should get results from vector search even with no keyword matches" + # Verify scores are in descending order + assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1)) + + +@pytest.mark.asyncio +async def test_query_chunks_hybrid_score_threshold(sqlite_vec_index, sample_chunks, sample_embeddings): + """Test hybrid search with a high score threshold.""" + await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) + + # Use a very high score threshold that no results will meet + query_embedding = sample_embeddings[0] + query_string = "Sentence 5" + + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=3, + score_threshold=1000.0, # Very high threshold + reranker_type="rrf", + reranker_params={"impact_factor": 60.0}, + ) + + # Should return no results due to high threshold + assert len(response.chunks) == 0 + + +@pytest.mark.asyncio +async def test_query_chunks_hybrid_different_embedding( + sqlite_vec_index, sample_chunks, sample_embeddings, embedding_dimension +): + """Test hybrid search with a different embedding than the stored ones.""" + await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) + + # Create a random embedding that's different from stored ones + query_embedding = np.random.rand(embedding_dimension).astype(np.float32) + query_string = "Sentence 5" + + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=3, + score_threshold=0.0, + reranker_type="rrf", + reranker_params={"impact_factor": 60.0}, + ) + + # Should still get results if keyword matches exist + assert len(response.chunks) > 0 + # Verify scores are in descending order + assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1)) + + +@pytest.mark.asyncio +async def test_query_chunks_hybrid_rrf_ranking(sqlite_vec_index, sample_chunks, sample_embeddings): + """Test that RRF properly combines rankings when documents appear in both search methods.""" + await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) + + # Create a query embedding that's similar to the first chunk + query_embedding = sample_embeddings[0] + # Use a keyword that appears in multiple documents + query_string = "Sentence 5" + + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=5, + score_threshold=0.0, + reranker_type="rrf", + reranker_params={"impact_factor": 60.0}, + ) + + # Verify we get results from both search methods + assert len(response.chunks) > 0 + # Verify scores are in descending order (RRF should maintain this) + assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1)) + + +@pytest.mark.asyncio +async def test_query_chunks_hybrid_score_selection(sqlite_vec_index, sample_chunks, sample_embeddings): + await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) + + # Create a query embedding that's similar to the first chunk + query_embedding = 
sample_embeddings[0] + # Use a keyword that appears in the first document + query_string = "Sentence 0 from document 0" + + # Test weighted re-ranking + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=1, + score_threshold=0.0, + reranker_type="weighted", + reranker_params={"alpha": 0.5}, + ) + assert len(response.chunks) == 1 + # Score should be weighted average of normalized keyword score and vector score + assert response.scores[0] > 0.5 # Both scores should be high + + # Test RRF re-ranking + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=1, + score_threshold=0.0, + reranker_type="rrf", + reranker_params={"impact_factor": 60.0}, + ) + assert len(response.chunks) == 1 + # RRF score should be sum of reciprocal ranks + assert response.scores[0] == pytest.approx(2.0 / 61.0, rel=1e-6) # 1/(60+1) + 1/(60+1) + + # Test default re-ranking (should be RRF) + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=1, + score_threshold=0.0, + reranker_type="rrf", + reranker_params={"impact_factor": 60.0}, + ) + assert len(response.chunks) == 1 + assert response.scores[0] == pytest.approx(2.0 / 61.0, rel=1e-6) # Should behave like RRF + + +@pytest.mark.asyncio +async def test_query_chunks_hybrid_mixed_results(sqlite_vec_index, sample_chunks, sample_embeddings): + """Test hybrid search with documents that appear in only one search method.""" + await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) + + # Create a query embedding that's similar to the first chunk + query_embedding = sample_embeddings[0] + # Use a keyword that appears in a different document + query_string = "Sentence 9 from document 2" + + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=3, + score_threshold=0.0, + reranker_type="rrf", + reranker_params={"impact_factor": 60.0}, + ) + + # Should get results from both search methods + assert len(response.chunks) > 0 + # Verify scores are in descending order + assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1)) + # Verify we get results from both the vector-similar document and keyword-matched document + doc_ids = {chunk.metadata["document_id"] for chunk in response.chunks} + assert "document-0" in doc_ids # From vector search + assert "document-2" in doc_ids # From keyword search + + +@pytest.mark.asyncio +async def test_query_chunks_hybrid_weighted_reranker_parametrization( + sqlite_vec_index, sample_chunks, sample_embeddings +): + """Test WeightedReRanker with different alpha values.""" + # Re-add data before each search to ensure test isolation + await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) + query_embedding = sample_embeddings[0] + query_string = "Sentence 0 from document 0" + + # alpha=1.0 (should behave like pure keyword) + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=1, + score_threshold=0.0, + reranker_type="weighted", + reranker_params={"alpha": 1.0}, + ) + assert len(response.chunks) > 0 # Should get at least one result + assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks) + + # alpha=0.0 (should behave like pure vector) + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=1, + score_threshold=0.0, + 
reranker_type="weighted", + reranker_params={"alpha": 0.0}, + ) + assert len(response.chunks) > 0 # Should get at least one result + assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks) + + await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) + # alpha=0.7 (should be a mix) + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=1, + score_threshold=0.0, + reranker_type="weighted", + reranker_params={"alpha": 0.7}, + ) + assert len(response.chunks) > 0 # Should get at least one result + assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks) + + +@pytest.mark.asyncio +async def test_query_chunks_hybrid_rrf_impact_factor(sqlite_vec_index, sample_chunks, sample_embeddings): + """Test RRFReRanker with different impact factors.""" + await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) + query_embedding = sample_embeddings[0] + query_string = "Sentence 0 from document 0" + + # impact_factor=10 + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=1, + score_threshold=0.0, + reranker_type="rrf", + reranker_params={"impact_factor": 10.0}, + ) + assert len(response.chunks) == 1 + assert response.scores[0] == pytest.approx(2.0 / 11.0, rel=1e-6) + + # impact_factor=100 + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=1, + score_threshold=0.0, + reranker_type="rrf", + reranker_params={"impact_factor": 100.0}, + ) + assert len(response.chunks) == 1 + assert response.scores[0] == pytest.approx(2.0 / 101.0, rel=1e-6) + + +@pytest.mark.asyncio +async def test_query_chunks_hybrid_edge_cases(sqlite_vec_index, sample_chunks, sample_embeddings): + await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings) + + # No results from either search - use a completely different embedding and a nonzero threshold + query_embedding = np.ones_like(sample_embeddings[0]) * -1 # Very different from sample embeddings + query_string = "no_such_keyword_that_will_never_match" + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=3, + score_threshold=0.1, # Nonzero threshold to filter out low-similarity matches + reranker_type="rrf", + reranker_params={"impact_factor": 60.0}, + ) + assert len(response.chunks) == 0 + + # All results below threshold + query_embedding = sample_embeddings[0] + query_string = "Sentence 0 from document 0" + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=3, + score_threshold=1000.0, + reranker_type="rrf", + reranker_params={"impact_factor": 60.0}, + ) + assert len(response.chunks) == 0 + + # Large k value + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=100, + score_threshold=0.0, + reranker_type="rrf", + reranker_params={"impact_factor": 60.0}, + ) + # Should not error, should return all available results + assert len(response.chunks) > 0 + assert len(response.chunks) <= 100 + + +@pytest.mark.asyncio +async def test_query_chunks_hybrid_tie_breaking( + sqlite_vec_index, sample_embeddings, embedding_dimension, tmp_path_factory +): + """Test tie-breaking and determinism when scores are equal.""" + # Create two chunks with the same content and embedding + chunk1 = Chunk(content="identical", metadata={"document_id": "docA"}) + chunk2 = 
Chunk(content="identical", metadata={"document_id": "docB"}) + chunks = [chunk1, chunk2] + # Use the same embedding for both chunks to ensure equal scores + same_embedding = sample_embeddings[0] + embeddings = np.array([same_embedding, same_embedding]) + + # Clear existing data and recreate index + await sqlite_vec_index.delete() + temp_dir = tmp_path_factory.getbasetemp() + db_path = str(temp_dir / "test_sqlite.db") + sqlite_vec_index = await SQLiteVecIndex.create(dimension=embedding_dimension, db_path=db_path, bank_id="test_bank") + await sqlite_vec_index.add_chunks(chunks, embeddings) + + # Query with the same embedding and content to ensure equal scores + query_embedding = same_embedding + query_string = "identical" + + # Run multiple queries to verify determinism + responses = [] + for _ in range(3): + response = await sqlite_vec_index.query_hybrid( + embedding=query_embedding, + query_string=query_string, + k=2, + score_threshold=0.0, + reranker_type="rrf", + reranker_params={"impact_factor": 60.0}, + ) + responses.append(response) + + # Verify all responses are identical + first_response = responses[0] + for response in responses[1:]: + assert response.chunks == first_response.chunks + assert response.scores == first_response.scores + + # Verify both chunks are returned with equal scores + assert len(first_response.chunks) == 2 + assert first_response.scores[0] == first_response.scores[1] + assert {chunk.metadata["document_id"] for chunk in first_response.chunks} == {"docA", "docB"} diff --git a/tests/verifications/openai_api/fixtures/pdfs/llama_stack_and_models.pdf b/tests/verifications/openai_api/fixtures/pdfs/llama_stack_and_models.pdf new file mode 100644 index 000000000..25579f425 Binary files /dev/null and b/tests/verifications/openai_api/fixtures/pdfs/llama_stack_and_models.pdf differ diff --git a/tests/verifications/openai_api/fixtures/test_cases/responses.yaml b/tests/verifications/openai_api/fixtures/test_cases/responses.yaml index 4d6c19b59..1acf06388 100644 --- a/tests/verifications/openai_api/fixtures/test_cases/responses.yaml +++ b/tests/verifications/openai_api/fixtures/test_cases/responses.yaml @@ -31,6 +31,25 @@ test_response_web_search: search_context_size: "low" output: "128" +test_response_file_search: + test_name: test_response_file_search + test_params: + case: + - case_id: "llama_experts" + input: "How many experts does the Llama 4 Maverick model have?" + tools: + - type: file_search + # vector_store_ids param for file_search tool gets added by the test runner + file_content: "Llama 4 Maverick has 128 experts" + output: "128" + - case_id: "llama_experts_pdf" + input: "How many experts does the Llama 4 Maverick model have?" + tools: + - type: file_search + # vector_store_ids param for file_search toolgets added by the test runner + file_path: "pdfs/llama_stack_and_models.pdf" + output: "128" + test_response_mcp_tool: test_name: test_response_mcp_tool test_params: diff --git a/tests/verifications/openai_api/test_responses.py b/tests/verifications/openai_api/test_responses.py index 28020d3b1..1c9cdaa3a 100644 --- a/tests/verifications/openai_api/test_responses.py +++ b/tests/verifications/openai_api/test_responses.py @@ -5,6 +5,8 @@ # the root directory of this source tree. 
import json +import os +import time import httpx import openai @@ -23,6 +25,31 @@ from tests.verifications.openai_api.fixtures.load import load_test_cases responses_test_cases = load_test_cases("responses") +def _new_vector_store(openai_client, name): + # Ensure we don't reuse an existing vector store + vector_stores = openai_client.vector_stores.list() + for vector_store in vector_stores: + if vector_store.name == name: + openai_client.vector_stores.delete(vector_store_id=vector_store.id) + + # Create a new vector store + vector_store = openai_client.vector_stores.create( + name=name, + ) + return vector_store + + +def _upload_file(openai_client, name, file_path): + # Ensure we don't reuse an existing file + files = openai_client.files.list() + for file in files: + if file.filename == name: + openai_client.files.delete(file_id=file.id) + + # Upload a text file with our document content + return openai_client.files.create(file=open(file_path, "rb"), purpose="assistants") + + @pytest.mark.parametrize( "case", responses_test_cases["test_response_basic"]["test_params"]["case"], @@ -258,6 +285,111 @@ def test_response_non_streaming_web_search(request, openai_client, model, provid assert case["output"].lower() in response.output_text.lower().strip() +@pytest.mark.parametrize( + "case", + responses_test_cases["test_response_file_search"]["test_params"]["case"], + ids=case_id_generator, +) +def test_response_non_streaming_file_search( + request, openai_client, model, provider, verification_config, tmp_path, case +): + if isinstance(openai_client, LlamaStackAsLibraryClient): + pytest.skip("Responses API file search is not yet supported in library client.") + + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + vector_store = _new_vector_store(openai_client, "test_vector_store") + + if "file_content" in case: + file_name = "test_response_non_streaming_file_search.txt" + file_path = tmp_path / file_name + file_path.write_text(case["file_content"]) + elif "file_path" in case: + file_path = os.path.join(os.path.dirname(__file__), "fixtures", case["file_path"]) + file_name = os.path.basename(file_path) + else: + raise ValueError(f"No file content or path provided for case {case['case_id']}") + + file_response = _upload_file(openai_client, file_name, file_path) + + # Attach our file to the vector store + file_attach_response = openai_client.vector_stores.files.create( + vector_store_id=vector_store.id, + file_id=file_response.id, + ) + + # Wait for the file to be attached + while file_attach_response.status == "in_progress": + time.sleep(0.1) + file_attach_response = openai_client.vector_stores.files.retrieve( + vector_store_id=vector_store.id, + file_id=file_response.id, + ) + assert file_attach_response.status == "completed", f"Expected file to be attached, got {file_attach_response}" + assert not file_attach_response.last_error + + # Update our tools with the right vector store id + tools = case["tools"] + for tool in tools: + if tool["type"] == "file_search": + tool["vector_store_ids"] = [vector_store.id] + + # Create the response request, which should query our vector store + response = openai_client.responses.create( + model=model, + input=case["input"], + tools=tools, + stream=False, + include=["file_search_call.results"], + ) + + # Verify the file_search_tool was called + assert len(response.output) > 1 + assert 
response.output[0].type == "file_search_call" + assert response.output[0].status == "completed" + assert response.output[0].queries # ensure it's some non-empty list + assert response.output[0].results + assert case["output"].lower() in response.output[0].results[0].text.lower() + assert response.output[0].results[0].score > 0 + + # Verify the output_text generated by the response + assert case["output"].lower() in response.output_text.lower().strip() + + +def test_response_non_streaming_file_search_empty_vector_store( + request, openai_client, model, provider, verification_config +): + if isinstance(openai_client, LlamaStackAsLibraryClient): + pytest.skip("Responses API file search is not yet supported in library client.") + + test_name_base = get_base_test_name(request) + if should_skip_test(verification_config, provider, model, test_name_base): + pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.") + + vector_store = _new_vector_store(openai_client, "test_vector_store") + + # Create the response request, which should query our vector store + response = openai_client.responses.create( + model=model, + input="How many experts does the Llama 4 Maverick model have?", + tools=[{"type": "file_search", "vector_store_ids": [vector_store.id]}], + stream=False, + include=["file_search_call.results"], + ) + + # Verify the file_search_tool was called + assert len(response.output) > 1 + assert response.output[0].type == "file_search_call" + assert response.output[0].status == "completed" + assert response.output[0].queries # ensure it's some non-empty list + assert not response.output[0].results # ensure we don't get any results + + # Verify some output_text was generated by the response + assert response.output_text + + @pytest.mark.parametrize( "case", responses_test_cases["test_response_mcp_tool"]["test_params"]["case"],