Merge branch 'main' into watsonx_hc

This commit is contained in:
Sumit Jaiswal 2025-06-16 14:44:46 +05:30 committed by GitHub
commit f5388e252d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
48 changed files with 2179 additions and 66 deletions

View file

@ -3240,6 +3240,59 @@
}
}
},
"/v1/openai/v1/vector_stores/{vector_store_id}/files": {
"post": {
"responses": {
"200": {
"description": "A VectorStoreFileObject representing the attached file.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/VectorStoreFileObject"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"VectorIO"
],
"description": "Attach a file to a vector store.",
"parameters": [
{
"name": "vector_store_id",
"in": "path",
"description": "The ID of the vector store to attach the file to.",
"required": true,
"schema": {
"type": "string"
}
}
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/OpenaiAttachFileToVectorStoreRequest"
}
}
},
"required": true
}
}
},
"/v1/openai/v1/completions": { "/v1/openai/v1/completions": {
"post": { "post": {
"responses": { "responses": {
@ -7047,6 +7100,9 @@
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
},
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall"
},
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
},
@ -7193,12 +7249,41 @@
"const": "file_search", "const": "file_search",
"default": "file_search" "default": "file_search"
}, },
"vector_store_id": { "vector_store_ids": {
"type": "array", "type": "array",
"items": { "items": {
"type": "string" "type": "string"
} }
}, },
"filters": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
},
"max_num_results": {
"type": "integer",
"default": 10
},
"ranking_options": { "ranking_options": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -7217,7 +7302,7 @@
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"type", "type",
"vector_store_id" "vector_store_ids"
], ],
"title": "OpenAIResponseInputToolFileSearch" "title": "OpenAIResponseInputToolFileSearch"
}, },
@ -7484,6 +7569,64 @@
],
"title": "OpenAIResponseOutputMessageContentOutputText"
},
"OpenAIResponseOutputMessageFileSearchToolCall": {
"type": "object",
"properties": {
"id": {
"type": "string"
},
"queries": {
"type": "array",
"items": {
"type": "string"
}
},
"status": {
"type": "string"
},
"type": {
"type": "string",
"const": "file_search_call",
"default": "file_search_call"
},
"results": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
}
},
"additionalProperties": false,
"required": [
"id",
"queries",
"status",
"type"
],
"title": "OpenAIResponseOutputMessageFileSearchToolCall"
},
"OpenAIResponseOutputMessageFunctionToolCall": { "OpenAIResponseOutputMessageFunctionToolCall": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -7760,6 +7903,9 @@
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
},
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall"
},
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
},
@ -7775,6 +7921,7 @@
"mapping": { "mapping": {
"message": "#/components/schemas/OpenAIResponseMessage", "message": "#/components/schemas/OpenAIResponseMessage",
"web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall", "web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall",
"file_search_call": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall",
"function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall", "function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall",
"mcp_call": "#/components/schemas/OpenAIResponseOutputMessageMCPCall", "mcp_call": "#/components/schemas/OpenAIResponseOutputMessageMCPCall",
"mcp_list_tools": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools" "mcp_list_tools": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools"
@ -11766,6 +11913,232 @@
],
"title": "LogEventRequest"
},
"VectorStoreChunkingStrategy": {
"oneOf": [
{
"$ref": "#/components/schemas/VectorStoreChunkingStrategyAuto"
},
{
"$ref": "#/components/schemas/VectorStoreChunkingStrategyStatic"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"auto": "#/components/schemas/VectorStoreChunkingStrategyAuto",
"static": "#/components/schemas/VectorStoreChunkingStrategyStatic"
}
}
},
"VectorStoreChunkingStrategyAuto": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "auto",
"default": "auto"
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "VectorStoreChunkingStrategyAuto"
},
"VectorStoreChunkingStrategyStatic": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "static",
"default": "static"
},
"static": {
"$ref": "#/components/schemas/VectorStoreChunkingStrategyStaticConfig"
}
},
"additionalProperties": false,
"required": [
"type",
"static"
],
"title": "VectorStoreChunkingStrategyStatic"
},
"VectorStoreChunkingStrategyStaticConfig": {
"type": "object",
"properties": {
"chunk_overlap_tokens": {
"type": "integer",
"default": 400
},
"max_chunk_size_tokens": {
"type": "integer",
"default": 800
}
},
"additionalProperties": false,
"required": [
"chunk_overlap_tokens",
"max_chunk_size_tokens"
],
"title": "VectorStoreChunkingStrategyStaticConfig"
},
"OpenaiAttachFileToVectorStoreRequest": {
"type": "object",
"properties": {
"file_id": {
"type": "string",
"description": "The ID of the file to attach to the vector store."
},
"attributes": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "The key-value attributes stored with the file, which can be used for filtering."
},
"chunking_strategy": {
"$ref": "#/components/schemas/VectorStoreChunkingStrategy",
"description": "The chunking strategy to use for the file."
}
},
"additionalProperties": false,
"required": [
"file_id"
],
"title": "OpenaiAttachFileToVectorStoreRequest"
},
"VectorStoreFileLastError": {
"type": "object",
"properties": {
"code": {
"oneOf": [
{
"type": "string",
"const": "server_error"
},
{
"type": "string",
"const": "rate_limit_exceeded"
}
]
},
"message": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"code",
"message"
],
"title": "VectorStoreFileLastError"
},
"VectorStoreFileObject": {
"type": "object",
"properties": {
"id": {
"type": "string"
},
"object": {
"type": "string",
"default": "vector_store.file"
},
"attributes": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
},
"chunking_strategy": {
"$ref": "#/components/schemas/VectorStoreChunkingStrategy"
},
"created_at": {
"type": "integer"
},
"last_error": {
"$ref": "#/components/schemas/VectorStoreFileLastError"
},
"status": {
"oneOf": [
{
"type": "string",
"const": "completed"
},
{
"type": "string",
"const": "in_progress"
},
{
"type": "string",
"const": "cancelled"
},
{
"type": "string",
"const": "failed"
}
]
},
"usage_bytes": {
"type": "integer",
"default": 0
},
"vector_store_id": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"id",
"object",
"attributes",
"chunking_strategy",
"created_at",
"status",
"usage_bytes",
"vector_store_id"
],
"title": "VectorStoreFileObject",
"description": "OpenAI Vector Store File object."
},
"OpenAIJSONSchema": { "OpenAIJSONSchema": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -12404,6 +12777,10 @@
},
"prompt_logprobs": {
"type": "integer"
},
"suffix": {
"type": "string",
"description": "(Optional) The suffix that should be appended to the completion."
}
},
"additionalProperties": false,
@ -13621,7 +13998,11 @@
},
"mode": {
"type": "string",
"description": "Search mode for retrieval—either \"vector\", \"keyword\", or \"hybrid\". Default \"vector\"."
},
"ranker": {
"$ref": "#/components/schemas/Ranker",
"description": "Configuration for the ranker to use in hybrid search. Defaults to RRF ranker."
}
},
"additionalProperties": false,
@ -13651,6 +14032,69 @@
}
}
},
"RRFRanker": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "rrf",
"default": "rrf",
"description": "The type of ranker, always \"rrf\""
},
"impact_factor": {
"type": "number",
"default": 60.0,
"description": "The impact factor for RRF scoring. Higher values give more weight to higher-ranked results. Must be greater than 0. Default of 60 is from the original RRF paper (Cormack et al., 2009)."
}
},
"additionalProperties": false,
"required": [
"type",
"impact_factor"
],
"title": "RRFRanker",
"description": "Reciprocal Rank Fusion (RRF) ranker configuration."
},
"Ranker": {
"oneOf": [
{
"$ref": "#/components/schemas/RRFRanker"
},
{
"$ref": "#/components/schemas/WeightedRanker"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"rrf": "#/components/schemas/RRFRanker",
"weighted": "#/components/schemas/WeightedRanker"
}
}
},
"WeightedRanker": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "weighted",
"default": "weighted",
"description": "The type of ranker, always \"weighted\""
},
"alpha": {
"type": "number",
"default": 0.5,
"description": "Weight factor between 0 and 1. 0 means only use keyword scores, 1 means only use vector scores, values in between blend both scores."
}
},
"additionalProperties": false,
"required": [
"type",
"alpha"
],
"title": "WeightedRanker",
"description": "Weighted ranker configuration that combines vector and keyword scores."
},
"QueryRequest": { "QueryRequest": {
"type": "object", "type": "object",
"properties": { "properties": {

View file

@ -2263,6 +2263,43 @@ paths:
schema:
$ref: '#/components/schemas/LogEventRequest'
required: true
/v1/openai/v1/vector_stores/{vector_store_id}/files:
post:
responses:
'200':
description: >-
A VectorStoreFileObject representing the attached file.
content:
application/json:
schema:
$ref: '#/components/schemas/VectorStoreFileObject'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- VectorIO
description: Attach a file to a vector store.
parameters:
- name: vector_store_id
in: path
description: >-
The ID of the vector store to attach the file to.
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/OpenaiAttachFileToVectorStoreRequest'
required: true
/v1/openai/v1/completions:
post:
responses:
@ -5021,6 +5058,7 @@ components:
OpenAIResponseInput:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
- $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
- $ref: '#/components/schemas/OpenAIResponseMessage'
@ -5115,10 +5153,23 @@ components:
type: string
const: file_search
default: file_search
vector_store_ids:
type: array
items:
type: string
filters:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
max_num_results:
type: integer
default: 10
ranking_options:
type: object
properties:
@ -5132,7 +5183,7 @@ components:
additionalProperties: false
required:
- type
- vector_store_ids
title: OpenAIResponseInputToolFileSearch
OpenAIResponseInputToolFunction:
type: object
@ -5294,6 +5345,41 @@ components:
- type
title: >-
OpenAIResponseOutputMessageContentOutputText
"OpenAIResponseOutputMessageFileSearchToolCall":
type: object
properties:
id:
type: string
queries:
type: array
items:
type: string
status:
type: string
type:
type: string
const: file_search_call
default: file_search_call
results:
type: array
items:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
additionalProperties: false
required:
- id
- queries
- status
- type
title: >-
OpenAIResponseOutputMessageFileSearchToolCall
"OpenAIResponseOutputMessageFunctionToolCall": "OpenAIResponseOutputMessageFunctionToolCall":
type: object type: object
properties: properties:
@ -5491,6 +5577,7 @@ components:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseMessage'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
@ -5499,6 +5586,7 @@ components:
mapping:
message: '#/components/schemas/OpenAIResponseMessage'
web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
file_search_call: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
function_call: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
@ -8251,6 +8339,148 @@ components:
- event
- ttl_seconds
title: LogEventRequest
VectorStoreChunkingStrategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
- $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
discriminator:
propertyName: type
mapping:
auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
VectorStoreChunkingStrategyAuto:
type: object
properties:
type:
type: string
const: auto
default: auto
additionalProperties: false
required:
- type
title: VectorStoreChunkingStrategyAuto
VectorStoreChunkingStrategyStatic:
type: object
properties:
type:
type: string
const: static
default: static
static:
$ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
additionalProperties: false
required:
- type
- static
title: VectorStoreChunkingStrategyStatic
VectorStoreChunkingStrategyStaticConfig:
type: object
properties:
chunk_overlap_tokens:
type: integer
default: 400
max_chunk_size_tokens:
type: integer
default: 800
additionalProperties: false
required:
- chunk_overlap_tokens
- max_chunk_size_tokens
title: VectorStoreChunkingStrategyStaticConfig
OpenaiAttachFileToVectorStoreRequest:
type: object
properties:
file_id:
type: string
description: >-
The ID of the file to attach to the vector store.
attributes:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
The key-value attributes stored with the file, which can be used for filtering.
chunking_strategy:
$ref: '#/components/schemas/VectorStoreChunkingStrategy'
description: >-
The chunking strategy to use for the file.
additionalProperties: false
required:
- file_id
title: OpenaiAttachFileToVectorStoreRequest
VectorStoreFileLastError:
type: object
properties:
code:
oneOf:
- type: string
const: server_error
- type: string
const: rate_limit_exceeded
message:
type: string
additionalProperties: false
required:
- code
- message
title: VectorStoreFileLastError
VectorStoreFileObject:
type: object
properties:
id:
type: string
object:
type: string
default: vector_store.file
attributes:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
chunking_strategy:
$ref: '#/components/schemas/VectorStoreChunkingStrategy'
created_at:
type: integer
last_error:
$ref: '#/components/schemas/VectorStoreFileLastError'
status:
oneOf:
- type: string
const: completed
- type: string
const: in_progress
- type: string
const: cancelled
- type: string
const: failed
usage_bytes:
type: integer
default: 0
vector_store_id:
type: string
additionalProperties: false
required:
- id
- object
- attributes
- chunking_strategy
- created_at
- status
- usage_bytes
- vector_store_id
title: VectorStoreFileObject
description: OpenAI Vector Store File object.
OpenAIJSONSchema:
type: object
properties:
@ -8673,6 +8903,10 @@ components:
type: string
prompt_logprobs:
type: integer
suffix:
type: string
description: >-
(Optional) The suffix that should be appended to the completion.
additionalProperties: false
required:
- model
@ -9526,7 +9760,13 @@ components:
mode:
type: string
description: >-
Search mode for retrieval—either "vector", "keyword", or "hybrid". Default
"vector".
ranker:
$ref: '#/components/schemas/Ranker'
description: >-
Configuration for the ranker to use in hybrid search. Defaults to RRF
ranker.
additionalProperties: false
required:
- query_generator_config
@ -9545,6 +9785,58 @@ components:
mapping:
default: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
llm: '#/components/schemas/LLMRAGQueryGeneratorConfig'
RRFRanker:
type: object
properties:
type:
type: string
const: rrf
default: rrf
description: The type of ranker, always "rrf"
impact_factor:
type: number
default: 60.0
description: >-
The impact factor for RRF scoring. Higher values give more weight to higher-ranked
results. Must be greater than 0. Default of 60 is from the original RRF
paper (Cormack et al., 2009).
additionalProperties: false
required:
- type
- impact_factor
title: RRFRanker
description: >-
Reciprocal Rank Fusion (RRF) ranker configuration.
Ranker:
oneOf:
- $ref: '#/components/schemas/RRFRanker'
- $ref: '#/components/schemas/WeightedRanker'
discriminator:
propertyName: type
mapping:
rrf: '#/components/schemas/RRFRanker'
weighted: '#/components/schemas/WeightedRanker'
WeightedRanker:
type: object
properties:
type:
type: string
const: weighted
default: weighted
description: The type of ranker, always "weighted"
alpha:
type: number
default: 0.5
description: >-
Weight factor between 0 and 1. 0 means only use keyword scores, 1 means
only use vector scores, values in between blend both scores.
additionalProperties: false
required:
- type
- alpha
title: WeightedRanker
description: >-
Weighted ranker configuration that combines vector and keyword scores.
QueryRequest:
type: object
properties:

View file

@ -18,6 +18,7 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
| agents | `inline::meta-reference` |
| datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` |
| files | `inline::localfs` |
| inference | `remote::ollama` |
| post_training | `inline::huggingface` |
| safety | `inline::llama-guard` |

View file

@ -66,25 +66,126 @@ To use sqlite-vec in your Llama Stack project, follow these steps:
2. Configure your Llama Stack project to use SQLite-Vec.
3. Start storing and querying vectors.
The SQLite-vec provider supports three search modes:

1. **Vector Search** (`mode="vector"`): Performs pure vector similarity search using the embeddings.
2. **Keyword Search** (`mode="keyword"`): Performs full-text search using SQLite's FTS5.
3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword search for better results. First performs keyword search to get candidate matches, then applies vector similarity search on those candidates.

Example with hybrid search:
```python
response = await vector_io.query_chunks(
    vector_db_id="my_db",
    query="your query here",
    params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
)

# Using RRF ranker
response = await vector_io.query_chunks(
    vector_db_id="my_db",
    query="your query here",
    params={
        "mode": "hybrid",
        "max_chunks": 3,
        "score_threshold": 0.7,
        "ranker": {"type": "rrf", "impact_factor": 60.0},
    },
)

# Using weighted ranker
response = await vector_io.query_chunks(
    vector_db_id="my_db",
    query="your query here",
    params={
        "mode": "hybrid",
        "max_chunks": 3,
        "score_threshold": 0.7,
        "ranker": {"type": "weighted", "alpha": 0.7},  # 70% vector, 30% keyword
    },
)
```
Example with explicit vector search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
query="your query here",
params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
)
```
Example with keyword search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
query="your query here",
params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
)
```
## Supported Search Modes
The SQLite vector store supports three search modes:
1. **Vector Search** (`mode="vector"`): Uses vector similarity to find relevant chunks
2. **Keyword Search** (`mode="keyword"`): Uses keyword matching to find relevant chunks
3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword scores using a ranker
### Hybrid Search
Hybrid search combines the strengths of both vector and keyword search by:
- Computing vector similarity scores
- Computing keyword match scores
- Using a ranker to combine these scores
Two ranker types are supported:
1. **RRF (Reciprocal Rank Fusion)**:
- Combines ranks from both vector and keyword results
- Uses an impact factor (default: 60.0) to control the weight of higher-ranked results
- Good for balancing between vector and keyword results
- The default impact factor of 60.0 comes from the original RRF paper by Cormack et al. (2009) [^1], which found this value to provide optimal performance across various retrieval tasks
2. **Weighted**:
- Linearly combines normalized vector and keyword scores
- Uses an alpha parameter (0-1) to control the blend:
- alpha=0: Only use keyword scores
- alpha=1: Only use vector scores
- alpha=0.5: Equal weight to both (default)
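To make the two strategies concrete, here is a small self-contained sketch of the arithmetic (toy scores and simplified normalization; not the provider's actual code):

```python
# Toy sketch of the two ranking strategies (illustrative only).
vector_scores = {"doc1": 0.9, "doc2": 0.6}   # vector similarity, already in [0, 1]
keyword_scores = {"doc2": 0.8, "doc3": 0.5}  # keyword relevance, already in [0, 1]


def weighted(vec, kw, alpha=0.5):
    # alpha=1.0 -> vector only, alpha=0.0 -> keyword only.
    # (The real implementation min-max normalizes each score set first.)
    ids = set(vec) | set(kw)
    return {d: alpha * vec.get(d, 0.0) + (1 - alpha) * kw.get(d, 0.0) for d in ids}


def rrf(vec, kw, impact_factor=60.0):
    # score = sum over result lists of 1 / (impact_factor + rank);
    # documents missing from a list contribute nothing for that list.
    def ranks(scores):
        ordered = sorted(scores, key=scores.get, reverse=True)
        return {d: r + 1 for r, d in enumerate(ordered)}

    rv, rk = ranks(vec), ranks(kw)
    return {
        d: 1.0 / (impact_factor + rv.get(d, float("inf")))
        + 1.0 / (impact_factor + rk.get(d, float("inf")))
        for d in set(vec) | set(kw)
    }


print(weighted(vector_scores, keyword_scores, alpha=0.7))
print(rrf(vector_scores, keyword_scores))  # doc2 ranks in both lists, so it wins
```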
Example using RAGQueryConfig with different search modes:
```python
from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker
# Vector search
config = RAGQueryConfig(mode="vector", max_chunks=5)
# Keyword search
config = RAGQueryConfig(mode="keyword", max_chunks=5)
# Hybrid search with custom RRF ranker
config = RAGQueryConfig(
mode="hybrid",
max_chunks=5,
ranker=RRFRanker(impact_factor=50.0), # Custom impact factor
)
# Hybrid search with weighted ranker
config = RAGQueryConfig(
mode="hybrid",
max_chunks=5,
ranker=WeightedRanker(alpha=0.7), # 70% vector, 30% keyword
)
# Hybrid search with default RRF ranker
config = RAGQueryConfig(
mode="hybrid", max_chunks=5
) # Will use RRF with impact_factor=60.0
```
Note: The ranker configuration is only used in hybrid mode. For vector or keyword modes, the ranker parameter is ignored.
## Installation

You can install SQLite-Vec using pip:
@ -96,3 +197,5 @@ pip install sqlite-vec
## Documentation

See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) for more details about sqlite-vec in general.
[^1]: Cormack, G. V., Clarke, C. L., & Buettcher, S. (2009). [Reciprocal rank fusion outperforms condorcet and individual rank learning methods](https://dl.acm.org/doi/10.1145/1571941.1572114). In Proceedings of the 32nd international ACM SIGIR conference on Research and development in information retrieval (pp. 758-759).

View file

@ -81,6 +81,15 @@ class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
type: Literal["web_search_call"] = "web_search_call" type: Literal["web_search_call"] = "web_search_call"
@json_schema_type
class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
id: str
queries: list[str]
status: str
type: Literal["file_search_call"] = "file_search_call"
results: list[dict[str, Any]] | None = None
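For illustration, a completed call of this new output type might look like the following (hypothetical values):

```python
# Hypothetical instance of the new file_search output type.
call = OpenAIResponseOutputMessageFileSearchToolCall(
    id="fs_call_123",
    queries=["what is torchtune"],
    status="completed",
    results=[{"file_id": "doc-1", "filename": "doc-1", "text": "...", "score": 0.87}],
)
```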
@json_schema_type
class OpenAIResponseOutputMessageFunctionToolCall(BaseModel):
call_id: str
@ -119,6 +128,7 @@ class OpenAIResponseOutputMessageMCPListTools(BaseModel):
OpenAIResponseOutput = Annotated[
OpenAIResponseMessage
| OpenAIResponseOutputMessageWebSearchToolCall
| OpenAIResponseOutputMessageFileSearchToolCall
| OpenAIResponseOutputMessageFunctionToolCall
| OpenAIResponseOutputMessageMCPCall
| OpenAIResponseOutputMessageMCPListTools,
@ -362,6 +372,7 @@ class OpenAIResponseInputFunctionToolCallOutput(BaseModel):
OpenAIResponseInput = Annotated[
# Responses API allows output messages to be passed in as input
OpenAIResponseOutputMessageWebSearchToolCall
| OpenAIResponseOutputMessageFileSearchToolCall
| OpenAIResponseOutputMessageFunctionToolCall
| OpenAIResponseInputFunctionToolCallOutput
|
@ -397,9 +408,10 @@ class FileSearchRankingOptions(BaseModel):
@json_schema_type
class OpenAIResponseInputToolFileSearch(BaseModel):
type: Literal["file_search"] = "file_search"
vector_store_ids: list[str]
filters: dict[str, Any] | None = None
max_num_results: int | None = Field(default=10, ge=1, le=50)
ranking_options: FileSearchRankingOptions | None = None


class ApprovalFilter(BaseModel):

View file

@ -1038,6 +1038,8 @@ class InferenceProvider(Protocol):
# vLLM-specific parameters
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
# for fill-in-the-middle type completion
suffix: str | None = None,
) -> OpenAICompletion:
"""Generate an OpenAI-compatible completion for the given prompt using the specified model.
@ -1058,6 +1060,7 @@ class InferenceProvider(Protocol):
:param temperature: (Optional) The temperature to use.
:param top_p: (Optional) The top p to use.
:param user: (Optional) The user to use.
:param suffix: (Optional) The suffix that should be appended to the completion.
:returns: An OpenAICompletion.
"""
...
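As a client-side usage sketch (model name, server URL, and port are illustrative), the new parameter enables fill-in-the-middle completions through the OpenAI-compatible endpoint:

```python
# Sketch: fill-in-the-middle completion via the OpenAI-compatible API.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
completion = client.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",  # illustrative model id
    prompt="def fib(n):\n",
    suffix="\nprint(fib(10))",  # text appended after the generated middle
    max_tokens=64,
)
print(completion.choices[0].text)
```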

View file

@ -15,6 +15,48 @@ from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
@json_schema_type
class RRFRanker(BaseModel):
"""
Reciprocal Rank Fusion (RRF) ranker configuration.
:param type: The type of ranker, always "rrf"
:param impact_factor: The impact factor for RRF scoring. Higher values give more weight to higher-ranked results.
Must be greater than 0. Default of 60 is from the original RRF paper (Cormack et al., 2009).
"""
type: Literal["rrf"] = "rrf"
impact_factor: float = Field(default=60.0, gt=0.0) # default of 60 for optimal performance
@json_schema_type
class WeightedRanker(BaseModel):
"""
Weighted ranker configuration that combines vector and keyword scores.
:param type: The type of ranker, always "weighted"
:param alpha: Weight factor between 0 and 1.
0 means only use keyword scores,
1 means only use vector scores,
values in between blend both scores.
"""
type: Literal["weighted"] = "weighted"
alpha: float = Field(
default=0.5,
ge=0.0,
le=1.0,
description="Weight factor between 0 and 1. 0 means only keyword scores, 1 means only vector scores.",
)
Ranker = Annotated[
RRFRanker | WeightedRanker,
Field(discriminator="type"),
]
register_schema(Ranker, name="Ranker")
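Because `Ranker` is a Pydantic discriminated union, a plain dict tagged with `type` resolves to the matching model; a quick sketch:

```python
from pydantic import TypeAdapter

# The "type" tag selects the concrete ranker model.
ranker = TypeAdapter(Ranker).validate_python({"type": "rrf", "impact_factor": 50.0})
assert isinstance(ranker, RRFRanker) and ranker.impact_factor == 50.0
```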
@json_schema_type
class RAGDocument(BaseModel):
"""
@ -76,7 +118,8 @@ class RAGQueryConfig(BaseModel):
:param chunk_template: Template for formatting each retrieved chunk in the context.
Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict).
Default: "Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n"
:param mode: Search mode for retrieval—either "vector", "keyword", or "hybrid". Default "vector".
:param ranker: Configuration for the ranker to use in hybrid search. Defaults to RRF ranker.
"""
# This config defines how a query is generated using the messages
@ -86,6 +129,7 @@ class RAGQueryConfig(BaseModel):
max_chunks: int = 5
chunk_template: str = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n"
mode: str | None = None
ranker: Ranker | None = Field(default=None) # Only used for hybrid mode
@field_validator("chunk_template") @field_validator("chunk_template")
def validate_chunk_template(cls, v: str) -> str: def validate_chunk_template(cls, v: str) -> str:

View file

@ -8,7 +8,7 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Annotated, Any, Literal, Protocol, runtime_checkable

from pydantic import BaseModel, Field
@ -16,6 +16,7 @@ from llama_stack.apis.inference import InterleavedContent
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
from llama_stack.strong_typing.schema import register_schema
class Chunk(BaseModel):
@ -133,6 +134,50 @@ class VectorStoreDeleteResponse(BaseModel):
deleted: bool = True
@json_schema_type
class VectorStoreChunkingStrategyAuto(BaseModel):
type: Literal["auto"] = "auto"
@json_schema_type
class VectorStoreChunkingStrategyStaticConfig(BaseModel):
chunk_overlap_tokens: int = 400
max_chunk_size_tokens: int = Field(800, ge=100, le=4096)
@json_schema_type
class VectorStoreChunkingStrategyStatic(BaseModel):
type: Literal["static"] = "static"
static: VectorStoreChunkingStrategyStaticConfig
VectorStoreChunkingStrategy = Annotated[
VectorStoreChunkingStrategyAuto | VectorStoreChunkingStrategyStatic, Field(discriminator="type")
]
register_schema(VectorStoreChunkingStrategy, name="VectorStoreChunkingStrategy")
@json_schema_type
class VectorStoreFileLastError(BaseModel):
code: Literal["server_error"] | Literal["rate_limit_exceeded"]
message: str
@json_schema_type
class VectorStoreFileObject(BaseModel):
"""OpenAI Vector Store File object."""
id: str
object: str = "vector_store.file"
attributes: dict[str, Any] = Field(default_factory=dict)
chunking_strategy: VectorStoreChunkingStrategy
created_at: int
last_error: VectorStoreFileLastError | None = None
status: Literal["completed"] | Literal["in_progress"] | Literal["cancelled"] | Literal["failed"]
usage_bytes: int = 0
vector_store_id: str
class VectorDBStore(Protocol):
def get_vector_db(self, vector_db_id: str) -> VectorDB | None: ...
@ -290,3 +335,21 @@ class VectorIO(Protocol):
:returns: A VectorStoreSearchResponse containing the search results.
"""
...
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}/files", method="POST")
async def openai_attach_file_to_vector_store(
self,
vector_store_id: str,
file_id: str,
attributes: dict[str, Any] | None = None,
chunking_strategy: VectorStoreChunkingStrategy | None = None,
) -> VectorStoreFileObject:
"""Attach a file to a vector store.
:param vector_store_id: The ID of the vector store to attach the file to.
:param file_id: The ID of the file to attach to the vector store.
:param attributes: The key-value attributes stored with the file, which can be used for filtering.
:param chunking_strategy: The chunking strategy to use for the file.
:returns: A VectorStoreFileObject representing the attached file.
"""
...
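A client-side sketch of this route (IDs and host are hypothetical; the shape mirrors the OpenAI vector store files API):

```python
import requests

resp = requests.post(
    "http://localhost:8321/v1/openai/v1/vector_stores/vs_123/files",
    json={
        "file_id": "file_abc",
        "attributes": {"project": "demo"},
        "chunking_strategy": {"type": "auto"},
    },
)
file_object = resp.json()  # a VectorStoreFileObject
print(file_object["status"], file_object["usage_bytes"])
```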

View file

@ -426,6 +426,7 @@ class InferenceRouter(Inference):
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
logger.debug(
f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}",
@ -456,6 +457,7 @@ class InferenceRouter(Inference):
user=user,
guided_choice=guided_choice,
prompt_logprobs=prompt_logprobs,
suffix=suffix,
)
provider = self.routing_table.get_provider_impl(model_obj.identifier)

View file

@ -19,6 +19,7 @@ from llama_stack.apis.vector_io import (
VectorStoreObject,
VectorStoreSearchResponsePage,
)
from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import RoutingTable
@ -254,3 +255,20 @@ class VectorIORouter(VectorIO):
ranking_options=ranking_options,
rewrite_query=rewrite_query,
)
async def openai_attach_file_to_vector_store(
self,
vector_store_id: str,
file_id: str,
attributes: dict[str, Any] | None = None,
chunking_strategy: VectorStoreChunkingStrategy | None = None,
) -> VectorStoreFileObject:
logger.debug(f"VectorIORouter.openai_attach_file_to_vector_store: {vector_store_id}, {file_id}")
# Route based on vector store ID
provider = self.routing_table.get_provider_impl(vector_store_id)
return await provider.openai_attach_file_to_vector_store(
vector_store_id=vector_store_id,
file_id=file_id,
attributes=attributes,
chunking_strategy=chunking_strategy,
)

View file

@ -24,6 +24,7 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseInputMessageContentImage,
OpenAIResponseInputMessageContentText,
OpenAIResponseInputTool,
OpenAIResponseInputToolFileSearch,
OpenAIResponseInputToolMCP,
OpenAIResponseMessage,
OpenAIResponseObject,
@ -34,6 +35,7 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseOutput,
OpenAIResponseOutputMessageContent,
OpenAIResponseOutputMessageContentOutputText,
OpenAIResponseOutputMessageFileSearchToolCall,
OpenAIResponseOutputMessageFunctionToolCall,
OpenAIResponseOutputMessageMCPListTools,
OpenAIResponseOutputMessageWebSearchToolCall,
@ -62,7 +64,7 @@ from llama_stack.apis.inference.inference import (
OpenAIToolMessageParam,
OpenAIUserMessageParam,
)
from llama_stack.apis.tools import RAGQueryConfig, ToolGroups, ToolRuntime
from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
@ -198,7 +200,8 @@ class OpenAIResponsePreviousResponseWithInputItems(BaseModel):
class ChatCompletionContext(BaseModel):
model: str
messages: list[OpenAIMessageParam]
response_tools: list[OpenAIResponseInputTool] | None = None
chat_tools: list[ChatCompletionToolParam] | None = None
mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP]
temperature: float | None
response_format: OpenAIResponseFormatParam
@ -388,7 +391,8 @@ class OpenAIResponsesImpl:
ctx = ChatCompletionContext(
model=model,
messages=messages,
response_tools=tools,
chat_tools=chat_tools,
mcp_tool_to_server=mcp_tool_to_server,
temperature=temperature,
response_format=response_format,
@ -417,7 +421,7 @@ class OpenAIResponsesImpl:
completion_result = await self.inference_api.openai_chat_completion(
model=ctx.model,
messages=messages,
tools=ctx.chat_tools,
stream=True,
temperature=ctx.temperature,
response_format=ctx.response_format,
@ -606,6 +610,12 @@ class OpenAIResponsesImpl:
if not tool:
raise ValueError(f"Tool {tool_name} not found")
chat_tools.append(make_openai_tool(tool_name, tool))
elif input_tool.type == "file_search":
tool_name = "knowledge_search"
tool = await self.tool_groups_api.get_tool(tool_name)
if not tool:
raise ValueError(f"Tool {tool_name} not found")
chat_tools.append(make_openai_tool(tool_name, tool))
elif input_tool.type == "mcp": elif input_tool.type == "mcp":
always_allowed = None always_allowed = None
never_allowed = None never_allowed = None
@ -667,6 +677,7 @@ class OpenAIResponsesImpl:
tool_call_id = tool_call.id
function = tool_call.function
tool_kwargs = json.loads(function.arguments) if function.arguments else {}
if not function or not tool_call_id or not function.name:
return None, None
@ -680,12 +691,26 @@ class OpenAIResponsesImpl:
endpoint=mcp_tool.server_url,
headers=mcp_tool.headers or {},
tool_name=function.name,
kwargs=tool_kwargs,
)
else:
if function.name == "knowledge_search":
response_file_search_tool = next(
t for t in ctx.response_tools if isinstance(t, OpenAIResponseInputToolFileSearch)
)
if response_file_search_tool:
if response_file_search_tool.filters:
logger.warning("Filters are not yet supported for file_search tool")
if response_file_search_tool.ranking_options:
logger.warning("Ranking options are not yet supported for file_search tool")
tool_kwargs["vector_db_ids"] = response_file_search_tool.vector_store_ids
tool_kwargs["query_config"] = RAGQueryConfig(
mode="vector",
max_chunks=response_file_search_tool.max_num_results,
)
result = await self.tool_runtime_api.invoke_tool(
tool_name=function.name,
kwargs=tool_kwargs,
)
except Exception as e:
error_exc = e
@ -713,6 +738,27 @@ class OpenAIResponsesImpl:
)
if error_exc or (result.error_code and result.error_code > 0) or result.error_message:
message.status = "failed"
elif function.name == "knowledge_search":
message = OpenAIResponseOutputMessageFileSearchToolCall(
id=tool_call_id,
queries=[tool_kwargs.get("query", "")],
status="completed",
)
if "document_ids" in result.metadata:
message.results = []
for i, doc_id in enumerate(result.metadata["document_ids"]):
text = result.metadata["chunks"][i] if "chunks" in result.metadata else None
score = result.metadata["scores"][i] if "scores" in result.metadata else None
message.results.append(
{
"file_id": doc_id,
"filename": doc_id,
"text": text,
"score": score,
}
)
if error_exc or (result.error_code and result.error_code > 0) or result.error_message:
message.status = "failed"
else:
raise ValueError(f"Unknown tool {function.name} called")
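Put together, the wiring above means a Responses API request can declare a `file_search` tool, which is executed through the built-in `knowledge_search` tool; a hypothetical request body:

```python
# Hypothetical Responses API request exercising the new file_search tool.
request_body = {
    "model": "meta-llama/Llama-3.2-3B-Instruct",  # illustrative model id
    "input": "How do I configure sqlite-vec?",
    "tools": [
        {
            "type": "file_search",
            "vector_store_ids": ["vs_123"],  # hypothetical vector store id
            "max_num_results": 5,
        }
    ],
}
```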

View file

@ -121,8 +121,10 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
vector_db_id=vector_db_id,
query=query,
params={
"mode": query_config.mode,
"max_chunks": query_config.max_chunks,
"score_threshold": 0.0,
"ranker": query_config.ranker,
},
)
for vector_db_id in vector_db_ids
@ -170,6 +172,8 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
content=picked,
metadata={
"document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]],
"chunks": [c.content for c in chunks[: len(picked)]],
"scores": scores[: len(picked)],
},
)

View file

@ -16,6 +16,6 @@ async def get_provider_impl(config: FaissVectorIOConfig, deps: dict[Api, Any]):
assert isinstance(config, FaissVectorIOConfig), f"Unexpected config type: {type(config)}"
impl = FaissVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files, None))
await impl.initialize()
return impl

View file

@ -15,6 +15,7 @@ import faiss
import numpy as np
from numpy.typing import NDArray
from llama_stack.apis.files import Files
from llama_stack.apis.inference import InterleavedContent
from llama_stack.apis.inference.inference import Inference
from llama_stack.apis.vector_dbs import VectorDB
@ -130,11 +131,23 @@ class FaissIndex(EmbeddingIndex):
) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in FAISS")
async def query_hybrid(
self,
embedding: NDArray,
query_string: str,
k: int,
score_threshold: float,
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
raise NotImplementedError("Hybrid search is not supported in FAISS")
class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
def __init__(self, config: FaissVectorIOConfig, inference_api: Inference, files_api: Files | None) -> None:
self.config = config
self.inference_api = inference_api
self.files_api = files_api
self.cache: dict[str, VectorDBWithIndex] = {}
self.kvstore: KVStore | None = None
self.openai_vector_stores: dict[str, dict[str, Any]] = {}

View file

@ -15,6 +15,6 @@ async def get_provider_impl(config: SQLiteVectorIOConfig, deps: dict[Api, Any]):
from .sqlite_vec import SQLiteVecVectorIOAdapter
assert isinstance(config, SQLiteVectorIOConfig), f"Unexpected config type: {type(config)}"
impl = SQLiteVecVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files, None))
await impl.initialize()
return impl

View file

@ -17,6 +17,7 @@ import numpy as np
import sqlite_vec
from numpy.typing import NDArray
from llama_stack.apis.files.files import Files
from llama_stack.apis.inference.inference import Inference
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import (
@ -26,14 +27,20 @@ from llama_stack.apis.vector_io import (
)
from llama_stack.providers.datatypes import VectorDBsProtocolPrivate
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
from llama_stack.providers.utils.memory.vector_store import (
RERANKER_TYPE_RRF,
RERANKER_TYPE_WEIGHTED,
EmbeddingIndex,
VectorDBWithIndex,
)
logger = logging.getLogger(__name__)

# Specifying search mode is dependent on the VectorIO provider.
VECTOR_SEARCH = "vector"
KEYWORD_SEARCH = "keyword"
HYBRID_SEARCH = "hybrid"
SEARCH_MODES = {VECTOR_SEARCH, KEYWORD_SEARCH, HYBRID_SEARCH}
def serialize_vector(vector: list[float]) -> bytes:
@ -50,6 +57,59 @@ def _create_sqlite_connection(db_path):
return connection
def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
"""Normalize scores to [0,1] range using min-max normalization."""
if not scores:
return {}
min_score = min(scores.values())
max_score = max(scores.values())
score_range = max_score - min_score
if score_range > 0:
return {doc_id: (score - min_score) / score_range for doc_id, score in scores.items()}
return {doc_id: 1.0 for doc_id in scores}
def _weighted_rerank(
vector_scores: dict[str, float],
keyword_scores: dict[str, float],
alpha: float = 0.5,
) -> dict[str, float]:
"""ReRanker that uses weighted average of scores."""
all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
normalized_vector_scores = _normalize_scores(vector_scores)
normalized_keyword_scores = _normalize_scores(keyword_scores)
return {
doc_id: (alpha * normalized_keyword_scores.get(doc_id, 0.0))
+ ((1 - alpha) * normalized_vector_scores.get(doc_id, 0.0))
for doc_id in all_ids
}
def _rrf_rerank(
vector_scores: dict[str, float],
keyword_scores: dict[str, float],
impact_factor: float = 60.0,
) -> dict[str, float]:
"""ReRanker that uses Reciprocal Rank Fusion."""
# Convert scores to ranks
vector_ranks = {
doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(vector_scores.items(), key=lambda x: x[1], reverse=True))
}
keyword_ranks = {
doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True))
}
all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
rrf_scores = {}
for doc_id in all_ids:
vector_rank = vector_ranks.get(doc_id, float("inf"))
keyword_rank = keyword_ranks.get(doc_id, float("inf"))
# RRF formula: score = 1/(k + r) where k is impact_factor and r is the rank
rrf_scores[doc_id] = (1.0 / (impact_factor + vector_rank)) + (1.0 / (impact_factor + keyword_rank))
return rrf_scores
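A quick numeric check of the two helpers above (values chosen for illustration):

```python
# Illustrative check; "b" appears in both result sets, so RRF favors it.
vector_scores = {"a": 0.95, "b": 0.80}
keyword_scores = {"b": 10.0, "c": 7.0}

print(_weighted_rerank(vector_scores, keyword_scores, alpha=0.5))
# RRF: "b" earns 1/(60+2) + 1/(60+1); "a" and "c" each get a single term.
print(_rrf_rerank(vector_scores, keyword_scores, impact_factor=60.0))
```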
class SQLiteVecIndex(EmbeddingIndex):
"""
An index implementation that stores embeddings in a SQLite virtual table using sqlite-vec.
@ -254,8 +314,6 @@ class SQLiteVecIndex(EmbeddingIndex):
""" """
Performs keyword-based search using SQLite FTS5 for relevance-ranked full-text search. Performs keyword-based search using SQLite FTS5 for relevance-ranked full-text search.
""" """
if query_string is None:
raise ValueError("query_string is required for keyword search.")
def _execute_query(): def _execute_query():
connection = _create_sqlite_connection(self.db_path) connection = _create_sqlite_connection(self.db_path)
@ -293,6 +351,81 @@ class SQLiteVecIndex(EmbeddingIndex):
scores.append(score)
return QueryChunksResponse(chunks=chunks, scores=scores)
async def query_hybrid(
self,
embedding: NDArray,
query_string: str,
k: int,
score_threshold: float,
reranker_type: str = RERANKER_TYPE_RRF,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
"""
Hybrid search using a configurable re-ranking strategy.
Args:
embedding: The query embedding vector
query_string: The text query for keyword search
k: Number of results to return
score_threshold: Minimum similarity score threshold
reranker_type: Type of reranker to use ("rrf" or "weighted")
reranker_params: Parameters for the reranker
Returns:
QueryChunksResponse with combined results
"""
if reranker_params is None:
reranker_params = {}
# Get results from both search methods
vector_response = await self.query_vector(embedding, k, score_threshold)
keyword_response = await self.query_keyword(query_string, k, score_threshold)
# Convert responses to score dictionaries using generate_chunk_id
vector_scores = {
generate_chunk_id(chunk.metadata["document_id"], str(chunk.content)): score
for chunk, score in zip(vector_response.chunks, vector_response.scores, strict=False)
}
keyword_scores = {
generate_chunk_id(chunk.metadata["document_id"], str(chunk.content)): score
for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
}
# Combine scores using the specified reranker
if reranker_type == RERANKER_TYPE_WEIGHTED:
alpha = reranker_params.get("alpha", 0.5)
combined_scores = _weighted_rerank(vector_scores, keyword_scores, alpha)
else:
# Default to RRF for None, RRF, or any unknown types
impact_factor = reranker_params.get("impact_factor", 60.0)
combined_scores = _rrf_rerank(vector_scores, keyword_scores, impact_factor)
# Sort by combined score and get top k results
sorted_items = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
top_k_items = sorted_items[:k]
# Filter by score threshold
filtered_items = [(doc_id, score) for doc_id, score in top_k_items if score >= score_threshold]
# Create a map of chunk_id to chunk for both responses
chunk_map = {}
for c in vector_response.chunks:
chunk_id = generate_chunk_id(c.metadata["document_id"], str(c.content))
chunk_map[chunk_id] = c
for c in keyword_response.chunks:
chunk_id = generate_chunk_id(c.metadata["document_id"], str(c.content))
chunk_map[chunk_id] = c
# Use the map to look up chunks by their IDs
chunks = []
scores = []
for doc_id, score in filtered_items:
if doc_id in chunk_map:
chunks.append(chunk_map[doc_id])
scores.append(score)
return QueryChunksResponse(chunks=chunks, scores=scores)
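For orientation, a hybrid query against this index could look like the following sketch (the index instance and query embedding are assumed to exist; RERANKER_TYPE_RRF is the constant defined in vector_store.py further below):

    # Assumes `index` is a populated SQLiteVecIndex and `query_vector` is an
    # NDArray produced by the same embedding model used at ingestion time.
    response = await index.query_hybrid(
        embedding=query_vector,
        query_string="reciprocal rank fusion",
        k=5,
        score_threshold=0.0,
        reranker_type=RERANKER_TYPE_RRF,
        reranker_params={"impact_factor": 60.0},
    )
    for chunk, score in zip(response.chunks, response.scores, strict=False):
        print(chunk.metadata["document_id"], score)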
class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
"""
@ -301,9 +434,10 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
and creates a cache of VectorDBWithIndex instances (each wrapping a SQLiteVecIndex).
"""
def __init__(self, config, inference_api: Inference, files_api: Files | None) -> None:
self.config = config
self.inference_api = inference_api
self.files_api = files_api
self.cache: dict[str, VectorDBWithIndex] = {}
self.openai_vector_stores: dict[str, dict[str, Any]] = {}
@ -343,7 +477,9 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
vector_db_data = row[0]
vector_db = VectorDB.model_validate_json(vector_db_data)
index = await SQLiteVecIndex.create(
vector_db.embedding_dimension,
self.config.db_path,
vector_db.identifier,
)
self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)
@ -369,7 +505,11 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
connection.close()
await asyncio.to_thread(_register_db)
index = await SQLiteVecIndex.create(
vector_db.embedding_dimension,
self.config.db_path,
vector_db.identifier,
)
self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)
async def list_vector_dbs(self) -> list[VectorDB]:

View file

@ -24,6 +24,7 @@ def available_providers() -> list[ProviderSpec]:
config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig", config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig",
deprecation_warning="Please use the `inline::faiss` provider instead.", deprecation_warning="Please use the `inline::faiss` provider instead.",
api_dependencies=[Api.inference], api_dependencies=[Api.inference],
optional_api_dependencies=[Api.files],
), ),
InlineProviderSpec( InlineProviderSpec(
api=Api.vector_io, api=Api.vector_io,
@ -32,6 +33,7 @@ def available_providers() -> list[ProviderSpec]:
module="llama_stack.providers.inline.vector_io.faiss", module="llama_stack.providers.inline.vector_io.faiss",
config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig", config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig",
api_dependencies=[Api.inference], api_dependencies=[Api.inference],
optional_api_dependencies=[Api.files],
), ),
# NOTE: sqlite-vec cannot be bundled into the container image because it does not have a # NOTE: sqlite-vec cannot be bundled into the container image because it does not have a
# source distribution and the wheels are not available for all platforms. # source distribution and the wheels are not available for all platforms.
@ -42,6 +44,7 @@ def available_providers() -> list[ProviderSpec]:
module="llama_stack.providers.inline.vector_io.sqlite_vec", module="llama_stack.providers.inline.vector_io.sqlite_vec",
config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig", config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig",
api_dependencies=[Api.inference], api_dependencies=[Api.inference],
optional_api_dependencies=[Api.files],
), ),
InlineProviderSpec( InlineProviderSpec(
api=Api.vector_io, api=Api.vector_io,
@ -51,6 +54,7 @@ def available_providers() -> list[ProviderSpec]:
config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig", config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig",
deprecation_warning="Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.", deprecation_warning="Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.",
api_dependencies=[Api.inference], api_dependencies=[Api.inference],
optional_api_dependencies=[Api.files],
), ),
remote_provider_spec( remote_provider_spec(
Api.vector_io, Api.vector_io,

View file

@ -318,6 +318,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
model_obj = await self.model_store.get_model(model)

View file

@ -316,6 +316,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
provider_model_id = await self._get_provider_model_id(model)

View file

@ -33,7 +33,6 @@ from llama_stack.apis.inference import (
JsonSchemaResponseFormat,
LogProbConfig,
Message,
OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
@ -46,6 +45,8 @@ from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAICompletion,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
OpenAIMessageParam,
OpenAIResponseFormatParam,
)
@ -62,8 +63,10 @@ from llama_stack.providers.utils.inference.model_registry import (
from llama_stack.providers.utils.inference.openai_compat import ( from llama_stack.providers.utils.inference.openai_compat import (
OpenAICompatCompletionChoice, OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse, OpenAICompatCompletionResponse,
b64_encode_openai_embeddings_response,
get_sampling_options, get_sampling_options,
prepare_openai_completion_params, prepare_openai_completion_params,
prepare_openai_embeddings_params,
process_chat_completion_response, process_chat_completion_response,
process_chat_completion_stream_response, process_chat_completion_stream_response,
process_completion_response, process_completion_response,
@ -386,7 +389,35 @@ class OllamaInferenceAdapter(
dimensions: int | None = None,
user: str | None = None,
) -> OpenAIEmbeddingsResponse:
raise NotImplementedError()
model_obj = await self._get_model(model)
if model_obj.model_type != ModelType.embedding:
raise ValueError(f"Model {model} is not an embedding model")
if model_obj.provider_resource_id is None:
raise ValueError(f"Model {model} has no provider_resource_id set")
# Note, at the moment Ollama does not support encoding_format, dimensions, and user parameters
params = prepare_openai_embeddings_params(
model=model_obj.provider_resource_id,
input=input,
encoding_format=encoding_format,
dimensions=dimensions,
user=user,
)
response = await self.openai_client.embeddings.create(**params)
data = b64_encode_openai_embeddings_response(response.data, encoding_format)
usage = OpenAIEmbeddingUsage(
prompt_tokens=response.usage.prompt_tokens,
total_tokens=response.usage.total_tokens,
)
# TODO: Investigate why model_obj.identifier is used instead of response.model
return OpenAIEmbeddingsResponse(
data=data,
model=model_obj.identifier,
usage=usage,
)
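As a quick orientation for the new Ollama embeddings path, a call might look like this sketch (the adapter instance and model id are assumptions, not part of the change):

    # Hypothetical usage; assumes `adapter` is an initialized OllamaInferenceAdapter
    # and that an embedding model is registered under this id.
    resp = await adapter.openai_embeddings(
        model="all-minilm:l6-v2",
        input=["first text", "second text"],
        encoding_format="float",
    )
    print(resp.model, resp.usage.prompt_tokens, len(resp.data))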
async def openai_completion(
self,
@ -409,6 +440,7 @@ class OllamaInferenceAdapter(
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
if not isinstance(prompt, str):
raise ValueError("Ollama does not support non-string prompts for completion")
@ -432,6 +464,7 @@ class OllamaInferenceAdapter(
temperature=temperature,
top_p=top_p,
user=user,
suffix=suffix,
)
return await self.openai_client.completions.create(**params)  # type: ignore
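With `suffix` now threaded through, Ollama can serve fill-in-the-middle style completions; a hypothetical call (model id and prompt are placeholders) might be:

    # Sketch only: assumes a FIM-capable model such as qwen2.5-coder:1.5b is
    # served through the adapter; all values are illustrative.
    response = await adapter.openai_completion(
        model="qwen2.5-coder:1.5b",
        prompt="def add(a, b):\n    return ",
        suffix="\n\nprint(add(1, 2))",
        max_tokens=16,
        stream=False,
    )
    print(response.choices[0].text)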

View file

@ -90,6 +90,7 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
if guided_choice is not None:
logging.warning("guided_choice is not supported by the OpenAI API. Ignoring.")
@ -117,6 +118,7 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
temperature=temperature,
top_p=top_p,
user=user,
suffix=suffix,
)
return await self._openai_client.completions.create(**params)

View file

@ -242,6 +242,7 @@ class PassthroughInferenceAdapter(Inference):
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
client = self._get_client()
model_obj = await self.model_store.get_model(model)

View file

@ -299,6 +299,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(

View file

@ -559,6 +559,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
self._lazy_initialize_client()
model_obj = await self._get_model(model)

View file

@ -313,6 +313,7 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(

View file

@ -23,6 +23,7 @@ from llama_stack.apis.vector_io import (
VectorStoreObject,
VectorStoreSearchResponsePage,
)
from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
from llama_stack.providers.inline.vector_io.chroma import ChromaVectorIOConfig as InlineChromaVectorIOConfig
from llama_stack.providers.utils.memory.vector_store import (
@ -104,6 +105,17 @@ class ChromaIndex(EmbeddingIndex):
) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in Chroma")
async def query_hybrid(
self,
embedding: NDArray,
query_string: str,
k: int,
score_threshold: float,
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
raise NotImplementedError("Hybrid search is not supported in Chroma")
class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
def __init__(
@ -241,3 +253,12 @@ class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
rewrite_query: bool | None = False,
) -> VectorStoreSearchResponsePage:
raise NotImplementedError("OpenAI Vector Stores API is not supported in Chroma")
async def openai_attach_file_to_vector_store(
self,
vector_store_id: str,
file_id: str,
attributes: dict[str, Any] | None = None,
chunking_strategy: VectorStoreChunkingStrategy | None = None,
) -> VectorStoreFileObject:
raise NotImplementedError("OpenAI Vector Stores API is not supported in Chroma")

View file

@ -25,6 +25,7 @@ from llama_stack.apis.vector_io import (
VectorStoreObject,
VectorStoreSearchResponsePage,
)
from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
from llama_stack.providers.inline.vector_io.milvus import MilvusVectorIOConfig as InlineMilvusVectorIOConfig
from llama_stack.providers.utils.memory.vector_store import (
@ -102,6 +103,17 @@ class MilvusIndex(EmbeddingIndex):
) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in Milvus")
async def query_hybrid(
self,
embedding: NDArray,
query_string: str,
k: int,
score_threshold: float,
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
raise NotImplementedError("Hybrid search is not supported in Milvus")
class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
def __init__(
@ -240,6 +252,15 @@ class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
) -> VectorStoreSearchResponsePage:
raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")
async def openai_attach_file_to_vector_store(
self,
vector_store_id: str,
file_id: str,
attributes: dict[str, Any] | None = None,
chunking_strategy: VectorStoreChunkingStrategy | None = None,
) -> VectorStoreFileObject:
raise NotImplementedError("OpenAI Vector Stores API is not supported in Milvus")
def generate_chunk_id(document_id: str, chunk_text: str) -> str:
"""Generate a unique chunk ID using a hash of document ID and chunk text."""

View file

@ -128,6 +128,17 @@ class PGVectorIndex(EmbeddingIndex):
) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in PGVector")
async def query_hybrid(
self,
embedding: NDArray,
query_string: str,
k: int,
score_threshold: float,
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
raise NotImplementedError("Hybrid search is not supported in PGVector")
async def delete(self):
with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(f"DROP TABLE IF EXISTS {self.table_name}")

View file

@ -23,6 +23,7 @@ from llama_stack.apis.vector_io import (
VectorStoreObject,
VectorStoreSearchResponsePage,
)
from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
from llama_stack.providers.inline.vector_io.qdrant import QdrantVectorIOConfig as InlineQdrantVectorIOConfig
from llama_stack.providers.utils.memory.vector_store import (
@ -111,6 +112,17 @@ class QdrantIndex(EmbeddingIndex):
) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in Qdrant")
async def query_hybrid(
self,
embedding: NDArray,
query_string: str,
k: int,
score_threshold: float,
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
raise NotImplementedError("Hybrid search is not supported in Qdrant")
async def delete(self):
await self.client.delete_collection(collection_name=self.collection_name)
@ -241,3 +253,12 @@ class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
rewrite_query: bool | None = False,
) -> VectorStoreSearchResponsePage:
raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")
async def openai_attach_file_to_vector_store(
self,
vector_store_id: str,
file_id: str,
attributes: dict[str, Any] | None = None,
chunking_strategy: VectorStoreChunkingStrategy | None = None,
) -> VectorStoreFileObject:
raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")

View file

@ -92,6 +92,17 @@ class WeaviateIndex(EmbeddingIndex):
) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in Weaviate")
async def query_hybrid(
self,
embedding: NDArray,
query_string: str,
k: int,
score_threshold: float,
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
raise NotImplementedError("Hybrid search is not supported in Weaviate")
class WeaviateVectorIOAdapter(
VectorIO,

View file

@ -4,8 +4,6 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import struct
from collections.abc import AsyncGenerator, AsyncIterator
from typing import Any
@ -37,7 +35,6 @@ from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAICompletion,
OpenAIEmbeddingData,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
OpenAIMessageParam,
@ -48,6 +45,7 @@ from llama_stack.distribution.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import (
b64_encode_openai_embeddings_response,
convert_message_to_openai_dict_new,
convert_openai_chat_completion_choice,
convert_openai_chat_completion_stream,
@ -293,16 +291,7 @@ class LiteLLMOpenAIMixin(
)
# Convert response to OpenAI format
data = []
for i, embedding_data in enumerate(response["data"]):
# we encode to base64 if the encoding format is base64 in the request
if encoding_format == "base64":
byte_data = b"".join(struct.pack("f", f) for f in embedding_data["embedding"])
embedding = base64.b64encode(byte_data).decode("utf-8")
else:
embedding = embedding_data["embedding"]
data.append(OpenAIEmbeddingData(embedding=embedding, index=i))
data = b64_encode_openai_embeddings_response(response.data, encoding_format)
usage = OpenAIEmbeddingUsage(
prompt_tokens=response["usage"]["prompt_tokens"],
@ -336,6 +325,7 @@ class LiteLLMOpenAIMixin(
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(

View file

@ -3,8 +3,10 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import json
import logging
import struct
import time
import uuid
import warnings
@ -108,6 +110,7 @@ from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
OpenAICompletion,
OpenAICompletionChoice,
OpenAIEmbeddingData,
OpenAIMessageParam,
OpenAIResponseFormatParam,
ToolConfig,
@ -1287,6 +1290,7 @@ class OpenAICompletionToLlamaStackMixin:
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
if stream:
raise ValueError(f"{self.__class__.__name__} doesn't support streaming openai completions")
@ -1483,3 +1487,55 @@ class OpenAIChatCompletionToLlamaStackMixin:
model=model,
object="chat.completion",
)
def prepare_openai_embeddings_params(
model: str,
input: str | list[str],
encoding_format: str | None = "float",
dimensions: int | None = None,
user: str | None = None,
):
if model is None:
raise ValueError("Model must be provided for embeddings")
input_list = [input] if isinstance(input, str) else input
params: dict[str, Any] = {
"model": model,
"input": input_list,
}
if encoding_format is not None:
params["encoding_format"] = encoding_format
if dimensions is not None:
params["dimensions"] = dimensions
if user is not None:
params["user"] = user
return params
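For reference, this is the shape of the kwargs dict the helper produces (a worked example; the model id is arbitrary):

    # Illustrative input/output for prepare_openai_embeddings_params.
    params = prepare_openai_embeddings_params(
        model="text-embedding-3-small",
        input="hello world",
        encoding_format="float",
    )
    assert params == {
        "model": "text-embedding-3-small",
        "input": ["hello world"],
        "encoding_format": "float",
    }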
def b64_encode_openai_embeddings_response(
response_data: dict, encoding_format: str | None = "float"
) -> list[OpenAIEmbeddingData]:
"""
Process the OpenAI embeddings response to encode the embeddings in base64 format if specified.
"""
data = []
for i, embedding_data in enumerate(response_data):
if encoding_format == "base64":
byte_array = bytearray()
for embedding_value in embedding_data.embedding:
byte_array.extend(struct.pack("f", float(embedding_value)))
response_embedding = base64.b64encode(byte_array).decode("utf-8")
else:
response_embedding = embedding_data.embedding
data.append(
OpenAIEmbeddingData(
embedding=response_embedding,
index=i,
)
)
return data
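To see what the base64 branch produces, the packed floats can be decoded back on the client side; a small round-trip sketch (the embedding values are made up):

    import base64
    import struct

    # Encode three floats the same way the helper does, then decode them back.
    encoded = base64.b64encode(b"".join(struct.pack("f", v) for v in [0.1, 0.2, 0.3])).decode("utf-8")
    raw = base64.b64decode(encoded)
    values = [v for (v,) in struct.iter_unpack("f", raw)]  # approximately [0.1, 0.2, 0.3]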

View file

@ -5,11 +5,13 @@
# the root directory of this source tree.
import logging
import mimetypes
import time
import uuid
from abc import ABC, abstractmethod
from typing import Any
from llama_stack.apis.files import Files
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import (
QueryChunksResponse,
@ -20,6 +22,15 @@ from llama_stack.apis.vector_io import (
VectorStoreSearchResponse,
VectorStoreSearchResponsePage,
)
from llama_stack.apis.vector_io.vector_io import (
Chunk,
VectorStoreChunkingStrategy,
VectorStoreChunkingStrategyAuto,
VectorStoreChunkingStrategyStatic,
VectorStoreFileLastError,
VectorStoreFileObject,
)
from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, make_overlapped_chunks
logger = logging.getLogger(__name__)
@ -36,6 +47,7 @@ class OpenAIVectorStoreMixin(ABC):
# These should be provided by the implementing class
openai_vector_stores: dict[str, dict[str, Any]]
files_api: Files | None
@abstractmethod
async def _save_openai_vector_store(self, store_id: str, store_info: dict[str, Any]) -> None:
@ -67,6 +79,16 @@ class OpenAIVectorStoreMixin(ABC):
"""Unregister a vector database (provider-specific implementation).""" """Unregister a vector database (provider-specific implementation)."""
pass pass
@abstractmethod
async def insert_chunks(
self,
vector_db_id: str,
chunks: list[Chunk],
ttl_seconds: int | None = None,
) -> None:
"""Insert chunks into a vector database (provider-specific implementation)."""
pass
@abstractmethod
async def query_chunks(
self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
@ -383,3 +405,78 @@ class OpenAIVectorStoreMixin(ABC):
if metadata[key] != value:
return False
return True
async def openai_attach_file_to_vector_store(
self,
vector_store_id: str,
file_id: str,
attributes: dict[str, Any] | None = None,
chunking_strategy: VectorStoreChunkingStrategy | None = None,
) -> VectorStoreFileObject:
attributes = attributes or {}
chunking_strategy = chunking_strategy or VectorStoreChunkingStrategyAuto()
vector_store_file_object = VectorStoreFileObject(
id=file_id,
attributes=attributes,
chunking_strategy=chunking_strategy,
created_at=int(time.time()),
status="in_progress",
vector_store_id=vector_store_id,
)
if not hasattr(self, "files_api") or not self.files_api:
vector_store_file_object.status = "failed"
vector_store_file_object.last_error = VectorStoreFileLastError(
code="server_error",
message="Files API is not available",
)
return vector_store_file_object
if isinstance(chunking_strategy, VectorStoreChunkingStrategyStatic):
max_chunk_size_tokens = chunking_strategy.static.max_chunk_size_tokens
chunk_overlap_tokens = chunking_strategy.static.chunk_overlap_tokens
else:
# Default values from OpenAI API spec
max_chunk_size_tokens = 800
chunk_overlap_tokens = 400
try:
file_response = await self.files_api.openai_retrieve_file(file_id)
mime_type, _ = mimetypes.guess_type(file_response.filename)
content_response = await self.files_api.openai_retrieve_file_content(file_id)
content = content_from_data_and_mime_type(content_response.body, mime_type)
chunks = make_overlapped_chunks(
file_id,
content,
max_chunk_size_tokens,
chunk_overlap_tokens,
attributes,
)
if not chunks:
vector_store_file_object.status = "failed"
vector_store_file_object.last_error = VectorStoreFileLastError(
code="server_error",
message="No chunks were generated from the file",
)
return vector_store_file_object
await self.insert_chunks(
vector_db_id=vector_store_id,
chunks=chunks,
)
except Exception as e:
logger.error(f"Error attaching file to vector store: {e}")
vector_store_file_object.status = "failed"
vector_store_file_object.last_error = VectorStoreFileLastError(
code="server_error",
message=str(e),
)
return vector_store_file_object
vector_store_file_object.status = "completed"
return vector_store_file_object
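A hypothetical end-to-end use of this method, assuming a provider that mixes in OpenAIVectorStoreMixin and a file already uploaded through the Files API (the ids are placeholders):

    # Sketch: `adapter`, the vector store id, and the file id are assumed to exist.
    file_object = await adapter.openai_attach_file_to_vector_store(
        vector_store_id="vs_123",
        file_id="file_abc",
        attributes={"source": "user-upload"},
        chunking_strategy=None,  # falls back to VectorStoreChunkingStrategyAuto()
    )
    if file_object.status == "failed":
        print(file_object.last_error.message)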

View file

@ -32,6 +32,10 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
log = logging.getLogger(__name__)
# Constants for reranker types
RERANKER_TYPE_RRF = "rrf"
RERANKER_TYPE_WEIGHTED = "weighted"
def parse_pdf(data: bytes) -> str:
# For PDF and DOC/DOCX files, we can't reliably convert to string
@ -72,16 +76,18 @@ def content_from_data(data_url: str) -> str:
data = unquote(data)
encoding = parts["encoding"] or "utf-8"
data = data.encode(encoding)
return content_from_data_and_mime_type(data, parts["mimetype"], parts.get("encoding", None))
encoding = parts["encoding"]
if not encoding:
import chardet
detected = chardet.detect(data)
encoding = detected["encoding"]
mime_type = parts["mimetype"]
mime_category = mime_type.split("/")[0]
def content_from_data_and_mime_type(data: bytes | str, mime_type: str | None, encoding: str | None = None) -> str:
if isinstance(data, bytes):
if not encoding:
import chardet
detected = chardet.detect(data)
encoding = detected["encoding"]
mime_category = mime_type.split("/")[0] if mime_type else None
if mime_category == "text":
# For text-based files (including CSV, MD)
return data.decode(encoding)
@ -200,6 +206,18 @@ class EmbeddingIndex(ABC):
async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
raise NotImplementedError()
@abstractmethod
async def query_hybrid(
self,
embedding: NDArray,
query_string: str,
k: int,
score_threshold: float,
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
raise NotImplementedError()
@abstractmethod
async def delete(self):
raise NotImplementedError()
@ -243,10 +261,29 @@ class VectorDBWithIndex:
k = params.get("max_chunks", 3) k = params.get("max_chunks", 3)
mode = params.get("mode") mode = params.get("mode")
score_threshold = params.get("score_threshold", 0.0) score_threshold = params.get("score_threshold", 0.0)
# Get ranker configuration
ranker = params.get("ranker")
if ranker is None:
# Default to RRF with impact_factor=60.0
reranker_type = RERANKER_TYPE_RRF
reranker_params = {"impact_factor": 60.0}
else:
reranker_type = ranker.type
reranker_params = (
{"impact_factor": ranker.impact_factor} if ranker.type == RERANKER_TYPE_RRF else {"alpha": ranker.alpha}
)
query_string = interleaved_content_as_str(query)
if mode == "keyword":
return await self.index.query_keyword(query_string, k, score_threshold)
# Calculate embeddings for both vector and hybrid modes
embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string])
query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
if mode == "hybrid":
return await self.index.query_hybrid(
query_vector, query_string, k, score_threshold, reranker_type, reranker_params
)
else:
embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string])
query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
return await self.index.query_vector(query_vector, k, score_threshold)
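Putting the ranker plumbing together, a hybrid retrieval through VectorDBWithIndex might be parameterized as below. This is a sketch: the surrounding method is assumed to be the class's query entry point taking (query, params), and SimpleNamespace merely stands in for whatever ranker config type the VectorIO API defines with type/impact_factor/alpha fields:

    from types import SimpleNamespace

    # Stand-in ranker config matching the attributes read above.
    ranker = SimpleNamespace(type="rrf", impact_factor=60.0)
    params = {"mode": "hybrid", "max_chunks": 5, "score_threshold": 0.0, "ranker": ranker}
    # response = await vector_db_with_index.query_chunks(query, params)  # assumed entry point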

View file

@ -23,6 +23,8 @@ distribution_spec:
- inline::basic
- inline::llm-as-judge
- inline::braintrust
files:
- inline::localfs
post_training:
- inline::huggingface
tool_runtime:

View file

@ -13,6 +13,7 @@ from llama_stack.distribution.datatypes import (
ShieldInput,
ToolGroupInput,
)
from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
from llama_stack.providers.inline.post_training.huggingface import HuggingFacePostTrainingConfig
from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
@ -29,6 +30,7 @@ def get_distribution_template() -> DistributionTemplate:
"eval": ["inline::meta-reference"], "eval": ["inline::meta-reference"],
"datasetio": ["remote::huggingface", "inline::localfs"], "datasetio": ["remote::huggingface", "inline::localfs"],
"scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
"files": ["inline::localfs"],
"post_training": ["inline::huggingface"], "post_training": ["inline::huggingface"],
"tool_runtime": [ "tool_runtime": [
"remote::brave-search", "remote::brave-search",
@ -49,6 +51,11 @@ def get_distribution_template() -> DistributionTemplate:
provider_type="inline::faiss", provider_type="inline::faiss",
config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
) )
files_provider = Provider(
provider_id="meta-reference-files",
provider_type="inline::localfs",
config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
)
posttraining_provider = Provider( posttraining_provider = Provider(
provider_id="huggingface", provider_id="huggingface",
provider_type="inline::huggingface", provider_type="inline::huggingface",
@ -98,6 +105,7 @@ def get_distribution_template() -> DistributionTemplate:
provider_overrides={
"inference": [inference_provider],
"vector_io": [vector_io_provider_faiss],
"files": [files_provider],
"post_training": [posttraining_provider],
},
default_models=[inference_model, embedding_model],
@ -107,6 +115,7 @@ def get_distribution_template() -> DistributionTemplate:
provider_overrides={
"inference": [inference_provider],
"vector_io": [vector_io_provider_faiss],
"files": [files_provider],
"post_training": [posttraining_provider],
"safety": [
Provider(

View file

@ -4,6 +4,7 @@ apis:
- agents
- datasetio
- eval
- files
- inference
- post_training
- safety
@ -84,6 +85,14 @@ providers:
provider_type: inline::braintrust
config:
openai_api_key: ${env.OPENAI_API_KEY:}
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/ollama/files}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/files_metadata.db
post_training:
- provider_id: huggingface
provider_type: inline::huggingface

View file

@ -4,6 +4,7 @@ apis:
- agents
- datasetio
- eval
- files
- inference
- post_training
- safety
@ -82,6 +83,14 @@ providers:
provider_type: inline::braintrust
config:
openai_api_key: ${env.OPENAI_API_KEY:}
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/ollama/files}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/files_metadata.db
post_training:
- provider_id: huggingface
provider_type: inline::huggingface

View file

@ -17,6 +17,8 @@ distribution_spec:
- inline::sqlite-vec
- remote::chromadb
- remote::pgvector
files:
- inline::localfs
safety:
- inline::llama-guard
agents:

View file

@ -4,6 +4,7 @@ apis:
- agents
- datasetio
- eval
- files
- inference
- safety
- scoring
@ -75,6 +76,14 @@ providers:
db: ${env.PGVECTOR_DB:}
user: ${env.PGVECTOR_USER:}
password: ${env.PGVECTOR_PASSWORD:}
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/starter/files}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/files_metadata.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard

View file

@ -12,6 +12,7 @@ from llama_stack.distribution.datatypes import (
ShieldInput,
ToolGroupInput,
)
from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
from llama_stack.providers.inline.inference.sentence_transformers import (
SentenceTransformersInferenceConfig,
)
@ -134,6 +135,7 @@ def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ([p.provider_type for p in inference_providers] + ["inline::sentence-transformers"]),
"vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"],
"files": ["inline::localfs"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
@ -170,6 +172,11 @@ def get_distribution_template() -> DistributionTemplate:
),
),
]
files_provider = Provider(
provider_id="meta-reference-files",
provider_type="inline::localfs",
config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
)
embedding_provider = Provider(
provider_id="sentence-transformers",
provider_type="inline::sentence-transformers",
@ -212,6 +219,7 @@ def get_distribution_template() -> DistributionTemplate:
provider_overrides={
"inference": inference_providers + [embedding_provider],
"vector_io": vector_io_providers,
"files": [files_provider],
},
default_models=default_models + [embedding_model],
default_tool_groups=default_tool_groups,

View file

@ -22,9 +22,6 @@ def provider_from_model(client_with_models, model_id):
def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id):
if isinstance(client_with_models, LlamaStackAsLibraryClient):
pytest.skip("OpenAI completions are not supported when testing with library client yet.")
provider = provider_from_model(client_with_models, model_id)
if provider.provider_type in (
"inline::meta-reference",
@ -44,6 +41,23 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.") pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
def skip_if_model_doesnt_support_suffix(client_with_models, model_id):
# To test `fim` ( fill in the middle ) completion, we need to use a model that supports suffix.
# Use this to specifically test this API functionality.
# pytest -sv --stack-config="inference=ollama" \
# tests/integration/inference/test_openai_completion.py \
# --text-model qwen2.5-coder:1.5b \
# -k test_openai_completion_non_streaming_suffix
if model_id != "qwen2.5-coder:1.5b":
pytest.skip(f"Suffix is not supported for the model: {model_id}.")
provider = provider_from_model(client_with_models, model_id)
if provider.provider_type != "remote::ollama":
pytest.skip(f"Provider {provider.provider_type} doesn't support suffix.")
def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
if isinstance(client_with_models, LlamaStackAsLibraryClient):
pytest.skip("OpenAI chat completions are not supported when testing with library client yet.")
@ -102,6 +116,32 @@ def test_openai_completion_non_streaming(llama_stack_client, client_with_models,
assert len(choice.text) > 10
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:suffix",
],
)
def test_openai_completion_non_streaming_suffix(llama_stack_client, client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
skip_if_model_doesnt_support_suffix(client_with_models, text_model_id)
tc = TestCase(test_case)
# ollama needs more verbose prompting for some reason here...
response = llama_stack_client.completions.create(
model=text_model_id,
prompt=tc["content"],
stream=False,
suffix=tc["suffix"],
max_tokens=10,
)
assert len(response.choices) > 0
choice = response.choices[0]
assert len(choice.text) > 5
assert "france" in choice.text.lower()
@pytest.mark.parametrize(
"test_case",
[

View file

@ -51,7 +51,6 @@ def skip_if_model_doesnt_support_openai_embeddings(client, model_id):
"remote::runpod", "remote::runpod",
"remote::sambanova", "remote::sambanova",
"remote::tgi", "remote::tgi",
"remote::ollama",
): ):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI embeddings.") pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI embeddings.")

View file

@ -4,6 +4,12 @@
"content": "Complete the sentence using one word: Roses are red, violets are " "content": "Complete the sentence using one word: Roses are red, violets are "
} }
}, },
"suffix": {
"data": {
"content": "The capital of ",
"suffix": "is Paris."
}
},
"non_streaming": { "non_streaming": {
"data": { "data": {
"content": "Micheael Jordan is born in ", "content": "Micheael Jordan is born in ",

View file

@ -84,6 +84,28 @@ async def test_query_chunks_full_text_search(sqlite_vec_index, sample_chunks, sa
assert len(response_no_results.chunks) == 0, f"Expected 0 results, but got {len(response_no_results.chunks)}"
@pytest.mark.asyncio
async def test_query_chunks_hybrid(sqlite_vec_index, sample_chunks, sample_embeddings):
await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
# Create a query embedding that's similar to the first chunk
query_embedding = sample_embeddings[0]
query_string = "Sentence 5"
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=3,
score_threshold=0.0,
reranker_type="rrf",
reranker_params={"impact_factor": 60.0},
)
assert len(response.chunks) == 3, f"Expected 3 results, got {len(response.chunks)}"
# Verify scores are in descending order (higher is better)
assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
@pytest.mark.asyncio
async def test_query_chunks_full_text_search_k_greater_than_results(sqlite_vec_index, sample_chunks, sample_embeddings):
# Re-initialize with a clean index
@ -141,3 +163,355 @@ def test_generate_chunk_id():
"bc744db3-1b25-0a9c-cdff-b6ba3df73c36", "bc744db3-1b25-0a9c-cdff-b6ba3df73c36",
"f68df25d-d9aa-ab4d-5684-64a233add20d", "f68df25d-d9aa-ab4d-5684-64a233add20d",
] ]
@pytest.mark.asyncio
async def test_query_chunks_hybrid_no_keyword_matches(sqlite_vec_index, sample_chunks, sample_embeddings):
"""Test hybrid search when keyword search returns no matches - should still return vector results."""
await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
# Use a non-existent keyword but a valid vector query
query_embedding = sample_embeddings[0]
query_string = "Sentence 499"
# First verify keyword search returns no results
keyword_response = await sqlite_vec_index.query_keyword(query_string, k=5, score_threshold=0.0)
assert len(keyword_response.chunks) == 0, "Keyword search should return no results"
# Get hybrid results
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=3,
score_threshold=0.0,
reranker_type="rrf",
reranker_params={"impact_factor": 60.0},
)
# Should still get results from vector search
assert len(response.chunks) > 0, "Should get results from vector search even with no keyword matches"
# Verify scores are in descending order
assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
@pytest.mark.asyncio
async def test_query_chunks_hybrid_score_threshold(sqlite_vec_index, sample_chunks, sample_embeddings):
"""Test hybrid search with a high score threshold."""
await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
# Use a very high score threshold that no results will meet
query_embedding = sample_embeddings[0]
query_string = "Sentence 5"
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=3,
score_threshold=1000.0, # Very high threshold
reranker_type="rrf",
reranker_params={"impact_factor": 60.0},
)
# Should return no results due to high threshold
assert len(response.chunks) == 0
@pytest.mark.asyncio
async def test_query_chunks_hybrid_different_embedding(
sqlite_vec_index, sample_chunks, sample_embeddings, embedding_dimension
):
"""Test hybrid search with a different embedding than the stored ones."""
await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
# Create a random embedding that's different from stored ones
query_embedding = np.random.rand(embedding_dimension).astype(np.float32)
query_string = "Sentence 5"
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=3,
score_threshold=0.0,
reranker_type="rrf",
reranker_params={"impact_factor": 60.0},
)
# Should still get results if keyword matches exist
assert len(response.chunks) > 0
# Verify scores are in descending order
assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
@pytest.mark.asyncio
async def test_query_chunks_hybrid_rrf_ranking(sqlite_vec_index, sample_chunks, sample_embeddings):
"""Test that RRF properly combines rankings when documents appear in both search methods."""
await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
# Create a query embedding that's similar to the first chunk
query_embedding = sample_embeddings[0]
# Use a keyword that appears in multiple documents
query_string = "Sentence 5"
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=5,
score_threshold=0.0,
reranker_type="rrf",
reranker_params={"impact_factor": 60.0},
)
# Verify we get results from both search methods
assert len(response.chunks) > 0
# Verify scores are in descending order (RRF should maintain this)
assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
@pytest.mark.asyncio
async def test_query_chunks_hybrid_score_selection(sqlite_vec_index, sample_chunks, sample_embeddings):
await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
# Create a query embedding that's similar to the first chunk
query_embedding = sample_embeddings[0]
# Use a keyword that appears in the first document
query_string = "Sentence 0 from document 0"
# Test weighted re-ranking
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=1,
score_threshold=0.0,
reranker_type="weighted",
reranker_params={"alpha": 0.5},
)
assert len(response.chunks) == 1
# Score should be weighted average of normalized keyword score and vector score
assert response.scores[0] > 0.5 # Both scores should be high
# Test RRF re-ranking
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=1,
score_threshold=0.0,
reranker_type="rrf",
reranker_params={"impact_factor": 60.0},
)
assert len(response.chunks) == 1
# RRF score should be sum of reciprocal ranks
assert response.scores[0] == pytest.approx(2.0 / 61.0, rel=1e-6) # 1/(60+1) + 1/(60+1)
# Test default re-ranking (should be RRF)
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=1,
score_threshold=0.0,
reranker_type="rrf",
reranker_params={"impact_factor": 60.0},
)
assert len(response.chunks) == 1
assert response.scores[0] == pytest.approx(2.0 / 61.0, rel=1e-6) # Should behave like RRF
@pytest.mark.asyncio
async def test_query_chunks_hybrid_mixed_results(sqlite_vec_index, sample_chunks, sample_embeddings):
"""Test hybrid search with documents that appear in only one search method."""
await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
# Create a query embedding that's similar to the first chunk
query_embedding = sample_embeddings[0]
# Use a keyword that appears in a different document
query_string = "Sentence 9 from document 2"
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=3,
score_threshold=0.0,
reranker_type="rrf",
reranker_params={"impact_factor": 60.0},
)
# Should get results from both search methods
assert len(response.chunks) > 0
# Verify scores are in descending order
assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
# Verify we get results from both the vector-similar document and keyword-matched document
doc_ids = {chunk.metadata["document_id"] for chunk in response.chunks}
assert "document-0" in doc_ids # From vector search
assert "document-2" in doc_ids # From keyword search
@pytest.mark.asyncio
async def test_query_chunks_hybrid_weighted_reranker_parametrization(
sqlite_vec_index, sample_chunks, sample_embeddings
):
"""Test WeightedReRanker with different alpha values."""
# Re-add data before each search to ensure test isolation
await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
query_embedding = sample_embeddings[0]
query_string = "Sentence 0 from document 0"
# alpha=1.0 (should behave like pure keyword)
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=1,
score_threshold=0.0,
reranker_type="weighted",
reranker_params={"alpha": 1.0},
)
assert len(response.chunks) > 0 # Should get at least one result
assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks)
# alpha=0.0 (should behave like pure vector)
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=1,
score_threshold=0.0,
reranker_type="weighted",
reranker_params={"alpha": 0.0},
)
assert len(response.chunks) > 0 # Should get at least one result
assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks)
await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
# alpha=0.7 (should be a mix)
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=1,
score_threshold=0.0,
reranker_type="weighted",
reranker_params={"alpha": 0.7},
)
assert len(response.chunks) > 0 # Should get at least one result
assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks)
@pytest.mark.asyncio
async def test_query_chunks_hybrid_rrf_impact_factor(sqlite_vec_index, sample_chunks, sample_embeddings):
"""Test RRFReRanker with different impact factors."""
await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
query_embedding = sample_embeddings[0]
query_string = "Sentence 0 from document 0"
# impact_factor=10
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=1,
score_threshold=0.0,
reranker_type="rrf",
reranker_params={"impact_factor": 10.0},
)
assert len(response.chunks) == 1
assert response.scores[0] == pytest.approx(2.0 / 11.0, rel=1e-6)
# impact_factor=100
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=1,
score_threshold=0.0,
reranker_type="rrf",
reranker_params={"impact_factor": 100.0},
)
assert len(response.chunks) == 1
assert response.scores[0] == pytest.approx(2.0 / 101.0, rel=1e-6)

@pytest.mark.asyncio
async def test_query_chunks_hybrid_edge_cases(sqlite_vec_index, sample_chunks, sample_embeddings):
await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
# No results from either search - use a completely different embedding and a nonzero threshold
query_embedding = np.ones_like(sample_embeddings[0]) * -1 # Very different from sample embeddings
query_string = "no_such_keyword_that_will_never_match"
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=3,
score_threshold=0.1, # Nonzero threshold to filter out low-similarity matches
reranker_type="rrf",
reranker_params={"impact_factor": 60.0},
)
assert len(response.chunks) == 0
# All results below threshold
query_embedding = sample_embeddings[0]
query_string = "Sentence 0 from document 0"
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=3,
score_threshold=1000.0,
reranker_type="rrf",
reranker_params={"impact_factor": 60.0},
)
assert len(response.chunks) == 0
# Large k value
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=100,
score_threshold=0.0,
reranker_type="rrf",
reranker_params={"impact_factor": 60.0},
)
# Should not error, should return all available results
assert len(response.chunks) > 0
assert len(response.chunks) <= 100

@pytest.mark.asyncio
async def test_query_chunks_hybrid_tie_breaking(
sqlite_vec_index, sample_embeddings, embedding_dimension, tmp_path_factory
):
"""Test tie-breaking and determinism when scores are equal."""
# Create two chunks with the same content and embedding
chunk1 = Chunk(content="identical", metadata={"document_id": "docA"})
chunk2 = Chunk(content="identical", metadata={"document_id": "docB"})
chunks = [chunk1, chunk2]
# Use the same embedding for both chunks to ensure equal scores
same_embedding = sample_embeddings[0]
embeddings = np.array([same_embedding, same_embedding])
# Clear existing data and recreate index
await sqlite_vec_index.delete()
temp_dir = tmp_path_factory.getbasetemp()
db_path = str(temp_dir / "test_sqlite.db")
sqlite_vec_index = await SQLiteVecIndex.create(dimension=embedding_dimension, db_path=db_path, bank_id="test_bank")
await sqlite_vec_index.add_chunks(chunks, embeddings)
# Query with the same embedding and content to ensure equal scores
query_embedding = same_embedding
query_string = "identical"
# Run multiple queries to verify determinism
responses = []
for _ in range(3):
response = await sqlite_vec_index.query_hybrid(
embedding=query_embedding,
query_string=query_string,
k=2,
score_threshold=0.0,
reranker_type="rrf",
reranker_params={"impact_factor": 60.0},
)
responses.append(response)
# Verify all responses are identical
first_response = responses[0]
for response in responses[1:]:
assert response.chunks == first_response.chunks
assert response.scores == first_response.scores
# Verify both chunks are returned with equal scores
assert len(first_response.chunks) == 2
assert first_response.scores[0] == first_response.scores[1]
assert {chunk.metadata["document_id"] for chunk in first_response.chunks} == {"docA", "docB"}
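
# Hedged note on what the tie-breaking test relies on: when fused scores are
# equal, the index must order results deterministically (e.g., a stable sort
# with a secondary key such as insertion order or chunk id), roughly:
# sorted(results, key=lambda r: (-r.score, r.chunk_id)); `chunk_id` here is
# an illustrative field name, not necessarily the stored one.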


@@ -31,6 +31,25 @@ test_response_web_search:
search_context_size: "low"
output: "128"
test_response_file_search:
test_name: test_response_file_search
test_params:
case:
- case_id: "llama_experts"
input: "How many experts does the Llama 4 Maverick model have?"
tools:
- type: file_search
# vector_store_ids param for file_search tool gets added by the test runner
file_content: "Llama 4 Maverick has 128 experts"
output: "128"
- case_id: "llama_experts_pdf"
input: "How many experts does the Llama 4 Maverick model have?"
tools:
- type: file_search
# vector_store_ids param for file_search tool gets added by the test runner
file_path: "pdfs/llama_stack_and_models.pdf"
output: "128"

test_response_mcp_tool:
test_name: test_response_mcp_tool
test_params:


@@ -5,6 +5,8 @@
# the root directory of this source tree.
import json
import os
import time

import httpx
import openai
@@ -23,6 +25,31 @@ from tests.verifications.openai_api.fixtures.load import load_test_cases
responses_test_cases = load_test_cases("responses")

def _new_vector_store(openai_client, name):
# Ensure we don't reuse an existing vector store
vector_stores = openai_client.vector_stores.list()
for vector_store in vector_stores:
if vector_store.name == name:
openai_client.vector_stores.delete(vector_store_id=vector_store.id)
# Create a new vector store
vector_store = openai_client.vector_stores.create(
name=name,
)
return vector_store

def _upload_file(openai_client, name, file_path):
# Ensure we don't reuse an existing file
files = openai_client.files.list()
for file in files:
if file.filename == name:
openai_client.files.delete(file_id=file.id)
# Upload a text file with our document content; use a context manager so the
# file handle is closed once the upload finishes
with open(file_path, "rb") as f:
    return openai_client.files.create(file=f, purpose="assistants")
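
# Typical use of these helpers in a test (hedged example; the names come from
# the helpers above, and the attach call mirrors the one used later in this file):
#   vector_store = _new_vector_store(openai_client, "my_test_store")
#   file = _upload_file(openai_client, "doc.txt", "/tmp/doc.txt")
#   openai_client.vector_stores.files.create(
#       vector_store_id=vector_store.id, file_id=file.id
#   )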

@pytest.mark.parametrize(
"case",
responses_test_cases["test_response_basic"]["test_params"]["case"],
@@ -258,6 +285,111 @@ def test_response_non_streaming_web_search(request, openai_client, model, provid
assert case["output"].lower() in response.output_text.lower().strip() assert case["output"].lower() in response.output_text.lower().strip()
@pytest.mark.parametrize(
"case",
responses_test_cases["test_response_file_search"]["test_params"]["case"],
ids=case_id_generator,
)
def test_response_non_streaming_file_search(
request, openai_client, model, provider, verification_config, tmp_path, case
):
if isinstance(openai_client, LlamaStackAsLibraryClient):
pytest.skip("Responses API file search is not yet supported in library client.")
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
vector_store = _new_vector_store(openai_client, "test_vector_store")
if "file_content" in case:
file_name = "test_response_non_streaming_file_search.txt"
file_path = tmp_path / file_name
file_path.write_text(case["file_content"])
elif "file_path" in case:
file_path = os.path.join(os.path.dirname(__file__), "fixtures", case["file_path"])
file_name = os.path.basename(file_path)
else:
raise ValueError(f"No file content or path provided for case {case['case_id']}")
file_response = _upload_file(openai_client, file_name, file_path)
# Attach our file to the vector store
file_attach_response = openai_client.vector_stores.files.create(
vector_store_id=vector_store.id,
file_id=file_response.id,
)
# Wait for the file to be attached
while file_attach_response.status == "in_progress":
time.sleep(0.1)
file_attach_response = openai_client.vector_stores.files.retrieve(
vector_store_id=vector_store.id,
file_id=file_response.id,
)
assert file_attach_response.status == "completed", f"Expected file to be attached, got {file_attach_response}"
assert not file_attach_response.last_error
# Update our tools with the right vector store id
tools = case["tools"]
for tool in tools:
if tool["type"] == "file_search":
tool["vector_store_ids"] = [vector_store.id]
# Create the response request, which should query our vector store
response = openai_client.responses.create(
model=model,
input=case["input"],
tools=tools,
stream=False,
include=["file_search_call.results"],
)
# Verify the file_search tool was called
assert len(response.output) > 1
assert response.output[0].type == "file_search_call"
assert response.output[0].status == "completed"
assert response.output[0].queries # ensure it's some non-empty list
assert response.output[0].results
assert case["output"].lower() in response.output[0].results[0].text.lower()
assert response.output[0].results[0].score > 0
# Verify the output_text generated by the response
assert case["output"].lower() in response.output_text.lower().strip()
def test_response_non_streaming_file_search_empty_vector_store(
request, openai_client, model, provider, verification_config
):
if isinstance(openai_client, LlamaStackAsLibraryClient):
pytest.skip("Responses API file search is not yet supported in library client.")
test_name_base = get_base_test_name(request)
if should_skip_test(verification_config, provider, model, test_name_base):
pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
vector_store = _new_vector_store(openai_client, "test_vector_store")
# Create the response request, which should query our vector store
response = openai_client.responses.create(
model=model,
input="How many experts does the Llama 4 Maverick model have?",
tools=[{"type": "file_search", "vector_store_ids": [vector_store.id]}],
stream=False,
include=["file_search_call.results"],
)
# Verify the file_search tool was called
assert len(response.output) > 1
assert response.output[0].type == "file_search_call"
assert response.output[0].status == "completed"
assert response.output[0].queries # ensure it's some non-empty list
assert not response.output[0].results # ensure we don't get any results
# Verify some output_text was generated by the response
assert response.output_text

@pytest.mark.parametrize(
"case",
responses_test_cases["test_response_mcp_tool"]["test_params"]["case"],