Merge branch 'main' into watsonx_hc
This commit is contained in: commit f5388e252d
48 changed files with 2179 additions and 66 deletions
docs/_static/llama-stack-spec.html (vendored): 450 lines changed
@@ -3240,6 +3240,59 @@
 }
 }
 },
+"/v1/openai/v1/vector_stores/{vector_store_id}/files": {
+  "post": {
+    "responses": {
+      "200": {
+        "description": "A VectorStoreFileObject representing the attached file.",
+        "content": {
+          "application/json": {
+            "schema": { "$ref": "#/components/schemas/VectorStoreFileObject" }
+          }
+        }
+      },
+      "400": { "$ref": "#/components/responses/BadRequest400" },
+      "429": { "$ref": "#/components/responses/TooManyRequests429" },
+      "500": { "$ref": "#/components/responses/InternalServerError500" },
+      "default": { "$ref": "#/components/responses/DefaultError" }
+    },
+    "tags": ["VectorIO"],
+    "description": "Attach a file to a vector store.",
+    "parameters": [
+      {
+        "name": "vector_store_id",
+        "in": "path",
+        "description": "The ID of the vector store to attach the file to.",
+        "required": true,
+        "schema": { "type": "string" }
+      }
+    ],
+    "requestBody": {
+      "content": {
+        "application/json": {
+          "schema": { "$ref": "#/components/schemas/OpenaiAttachFileToVectorStoreRequest" }
+        }
+      },
+      "required": true
+    }
+  }
+},
 "/v1/openai/v1/completions": {
 "post": {
 "responses": {
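For orientation, a request against the route added above might look like the following sketch; the server address and both IDs are placeholders, not values from this commit:

```python
# Minimal sketch of calling the new attach-file route with httpx.
# Assumes a Llama Stack server reachable at localhost:8321; IDs are hypothetical.
import httpx

resp = httpx.post(
    "http://localhost:8321/v1/openai/v1/vector_stores/vs_123/files",
    json={"file_id": "file_456"},  # OpenaiAttachFileToVectorStoreRequest body
)
vector_store_file = resp.json()  # VectorStoreFileObject: id, status, chunking_strategy, ...
```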
@@ -7047,6 +7100,9 @@
 {
 "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
 },
+{
+  "$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall"
+},
 {
 "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
 },
@@ -7193,12 +7249,41 @@
 "const": "file_search",
 "default": "file_search"
 },
-"vector_store_id": {
+"vector_store_ids": {
 "type": "array",
 "items": {
 "type": "string"
 }
 },
+"filters": {
+  "type": "object",
+  "additionalProperties": {
+    "oneOf": [
+      { "type": "null" },
+      { "type": "boolean" },
+      { "type": "number" },
+      { "type": "string" },
+      { "type": "array" },
+      { "type": "object" }
+    ]
+  }
+},
+"max_num_results": {
+  "type": "integer",
+  "default": 10
+},
 "ranking_options": {
 "type": "object",
 "properties": {
@@ -7217,7 +7302,7 @@
 "additionalProperties": false,
 "required": [
 "type",
-"vector_store_id"
+"vector_store_ids"
 ],
 "title": "OpenAIResponseInputToolFileSearch"
 },
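With the rename above, a `file_search` tool entry in a Responses request body is keyed by `vector_store_ids`; a hedged sketch with placeholder values:

```python
# Sketch of a file_search tool entry under the renamed field; IDs and filter values are placeholders.
file_search_tool = {
    "type": "file_search",
    "vector_store_ids": ["vs_123"],      # was "vector_store_id" before this change
    "max_num_results": 5,                # optional, defaults to 10
    "filters": {"topic": "torchtune"},   # optional key/value attribute filter
}
```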
@@ -7484,6 +7569,64 @@
 ],
 "title": "OpenAIResponseOutputMessageContentOutputText"
 },
+"OpenAIResponseOutputMessageFileSearchToolCall": {
+  "type": "object",
+  "properties": {
+    "id": { "type": "string" },
+    "queries": {
+      "type": "array",
+      "items": { "type": "string" }
+    },
+    "status": { "type": "string" },
+    "type": {
+      "type": "string",
+      "const": "file_search_call",
+      "default": "file_search_call"
+    },
+    "results": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "additionalProperties": {
+          "oneOf": [
+            { "type": "null" },
+            { "type": "boolean" },
+            { "type": "number" },
+            { "type": "string" },
+            { "type": "array" },
+            { "type": "object" }
+          ]
+        }
+      }
+    }
+  },
+  "additionalProperties": false,
+  "required": ["id", "queries", "status", "type"],
+  "title": "OpenAIResponseOutputMessageFileSearchToolCall"
+},
 "OpenAIResponseOutputMessageFunctionToolCall": {
 "type": "object",
 "properties": {
@@ -7760,6 +7903,9 @@
 {
 "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
 },
+{
+  "$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall"
+},
 {
 "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
 },
@@ -7775,6 +7921,7 @@
 "mapping": {
 "message": "#/components/schemas/OpenAIResponseMessage",
 "web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall",
+"file_search_call": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall",
 "function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall",
 "mcp_call": "#/components/schemas/OpenAIResponseOutputMessageMCPCall",
 "mcp_list_tools": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools"
@@ -11766,6 +11913,232 @@
 ],
 "title": "LogEventRequest"
 },
+"VectorStoreChunkingStrategy": {
+  "oneOf": [
+    { "$ref": "#/components/schemas/VectorStoreChunkingStrategyAuto" },
+    { "$ref": "#/components/schemas/VectorStoreChunkingStrategyStatic" }
+  ],
+  "discriminator": {
+    "propertyName": "type",
+    "mapping": {
+      "auto": "#/components/schemas/VectorStoreChunkingStrategyAuto",
+      "static": "#/components/schemas/VectorStoreChunkingStrategyStatic"
+    }
+  }
+},
+"VectorStoreChunkingStrategyAuto": {
+  "type": "object",
+  "properties": {
+    "type": {
+      "type": "string",
+      "const": "auto",
+      "default": "auto"
+    }
+  },
+  "additionalProperties": false,
+  "required": ["type"],
+  "title": "VectorStoreChunkingStrategyAuto"
+},
+"VectorStoreChunkingStrategyStatic": {
+  "type": "object",
+  "properties": {
+    "type": {
+      "type": "string",
+      "const": "static",
+      "default": "static"
+    },
+    "static": { "$ref": "#/components/schemas/VectorStoreChunkingStrategyStaticConfig" }
+  },
+  "additionalProperties": false,
+  "required": ["type", "static"],
+  "title": "VectorStoreChunkingStrategyStatic"
+},
+"VectorStoreChunkingStrategyStaticConfig": {
+  "type": "object",
+  "properties": {
+    "chunk_overlap_tokens": {
+      "type": "integer",
+      "default": 400
+    },
+    "max_chunk_size_tokens": {
+      "type": "integer",
+      "default": 800
+    }
+  },
+  "additionalProperties": false,
+  "required": ["chunk_overlap_tokens", "max_chunk_size_tokens"],
+  "title": "VectorStoreChunkingStrategyStaticConfig"
+},
+"OpenaiAttachFileToVectorStoreRequest": {
+  "type": "object",
+  "properties": {
+    "file_id": {
+      "type": "string",
+      "description": "The ID of the file to attach to the vector store."
+    },
+    "attributes": {
+      "type": "object",
+      "additionalProperties": {
+        "oneOf": [
+          { "type": "null" },
+          { "type": "boolean" },
+          { "type": "number" },
+          { "type": "string" },
+          { "type": "array" },
+          { "type": "object" }
+        ]
+      },
+      "description": "The key-value attributes stored with the file, which can be used for filtering."
+    },
+    "chunking_strategy": {
+      "$ref": "#/components/schemas/VectorStoreChunkingStrategy",
+      "description": "The chunking strategy to use for the file."
+    }
+  },
+  "additionalProperties": false,
+  "required": ["file_id"],
+  "title": "OpenaiAttachFileToVectorStoreRequest"
+},
+"VectorStoreFileLastError": {
+  "type": "object",
+  "properties": {
+    "code": {
+      "oneOf": [
+        { "type": "string", "const": "server_error" },
+        { "type": "string", "const": "rate_limit_exceeded" }
+      ]
+    },
+    "message": { "type": "string" }
+  },
+  "additionalProperties": false,
+  "required": ["code", "message"],
+  "title": "VectorStoreFileLastError"
+},
+"VectorStoreFileObject": {
+  "type": "object",
+  "properties": {
+    "id": { "type": "string" },
+    "object": {
+      "type": "string",
+      "default": "vector_store.file"
+    },
+    "attributes": {
+      "type": "object",
+      "additionalProperties": {
+        "oneOf": [
+          { "type": "null" },
+          { "type": "boolean" },
+          { "type": "number" },
+          { "type": "string" },
+          { "type": "array" },
+          { "type": "object" }
+        ]
+      }
+    },
+    "chunking_strategy": { "$ref": "#/components/schemas/VectorStoreChunkingStrategy" },
+    "created_at": { "type": "integer" },
+    "last_error": { "$ref": "#/components/schemas/VectorStoreFileLastError" },
+    "status": {
+      "oneOf": [
+        { "type": "string", "const": "completed" },
+        { "type": "string", "const": "in_progress" },
+        { "type": "string", "const": "cancelled" },
+        { "type": "string", "const": "failed" }
+      ]
+    },
+    "usage_bytes": {
+      "type": "integer",
+      "default": 0
+    },
+    "vector_store_id": { "type": "string" }
+  },
+  "additionalProperties": false,
+  "required": ["id", "object", "attributes", "chunking_strategy", "created_at", "status", "usage_bytes", "vector_store_id"],
+  "title": "VectorStoreFileObject",
+  "description": "OpenAI Vector Store File object."
+},
 "OpenAIJSONSchema": {
 "type": "object",
 "properties": {
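Read together, the new schemas imply request and response payloads roughly like the following; every value here is illustrative only:

```python
# Illustrative payloads: field names follow the schemas above, values are invented.
attach_request = {
    "file_id": "file_456",
    "attributes": {"topic": "torchtune", "year": 2024},
    "chunking_strategy": {
        "type": "static",
        "static": {"chunk_overlap_tokens": 400, "max_chunk_size_tokens": 800},
    },
}

# Shape of the VectorStoreFileObject the endpoint returns:
attach_response = {
    "id": "file_456",
    "object": "vector_store.file",
    "attributes": {"topic": "torchtune", "year": 2024},
    "chunking_strategy": {
        "type": "static",
        "static": {"chunk_overlap_tokens": 400, "max_chunk_size_tokens": 800},
    },
    "created_at": 1718000000,
    "status": "completed",  # or "in_progress", "cancelled", "failed"
    "usage_bytes": 0,
    "vector_store_id": "vs_123",
}
```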
@@ -12404,6 +12777,10 @@
 },
 "prompt_logprobs": {
 "type": "integer"
+},
+"suffix": {
+  "type": "string",
+  "description": "(Optional) The suffix that should be appended to the completion."
 }
 },
 "additionalProperties": false,
@@ -13621,7 +13998,11 @@
 },
 "mode": {
 "type": "string",
-"description": "Search mode for retrieval—either \"vector\" or \"keyword\". Default \"vector\"."
+"description": "Search mode for retrieval—either \"vector\", \"keyword\", or \"hybrid\". Default \"vector\"."
+},
+"ranker": {
+  "$ref": "#/components/schemas/Ranker",
+  "description": "Configuration for the ranker to use in hybrid search. Defaults to RRF ranker."
 }
 },
 "additionalProperties": false,
@@ -13651,6 +14032,69 @@
 }
 }
 },
+"RRFRanker": {
+  "type": "object",
+  "properties": {
+    "type": {
+      "type": "string",
+      "const": "rrf",
+      "default": "rrf",
+      "description": "The type of ranker, always \"rrf\""
+    },
+    "impact_factor": {
+      "type": "number",
+      "default": 60.0,
+      "description": "The impact factor for RRF scoring. Higher values give more weight to higher-ranked results. Must be greater than 0. Default of 60 is from the original RRF paper (Cormack et al., 2009)."
+    }
+  },
+  "additionalProperties": false,
+  "required": ["type", "impact_factor"],
+  "title": "RRFRanker",
+  "description": "Reciprocal Rank Fusion (RRF) ranker configuration."
+},
+"Ranker": {
+  "oneOf": [
+    { "$ref": "#/components/schemas/RRFRanker" },
+    { "$ref": "#/components/schemas/WeightedRanker" }
+  ],
+  "discriminator": {
+    "propertyName": "type",
+    "mapping": {
+      "rrf": "#/components/schemas/RRFRanker",
+      "weighted": "#/components/schemas/WeightedRanker"
+    }
+  }
+},
+"WeightedRanker": {
+  "type": "object",
+  "properties": {
+    "type": {
+      "type": "string",
+      "const": "weighted",
+      "default": "weighted",
+      "description": "The type of ranker, always \"weighted\""
+    },
+    "alpha": {
+      "type": "number",
+      "default": 0.5,
+      "description": "Weight factor between 0 and 1. 0 means only use keyword scores, 1 means only use vector scores, values in between blend both scores."
+    }
+  },
+  "additionalProperties": false,
+  "required": ["type", "alpha"],
+  "title": "WeightedRanker",
+  "description": "Weighted ranker configuration that combines vector and keyword scores."
+},
 "QueryRequest": {
 "type": "object",
 "properties": {
docs/_static/llama-stack-spec.yaml (vendored): 298 lines changed
@@ -2263,6 +2263,43 @@ paths:
             schema:
               $ref: '#/components/schemas/LogEventRequest'
         required: true
+  /v1/openai/v1/vector_stores/{vector_store_id}/files:
+    post:
+      responses:
+        '200':
+          description: >-
+            A VectorStoreFileObject representing the attached file.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/VectorStoreFileObject'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - VectorIO
+      description: Attach a file to a vector store.
+      parameters:
+        - name: vector_store_id
+          in: path
+          description: >-
+            The ID of the vector store to attach the file to.
+          required: true
+          schema:
+            type: string
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/OpenaiAttachFileToVectorStoreRequest'
+        required: true
   /v1/openai/v1/completions:
     post:
       responses:
@@ -5021,6 +5058,7 @@ components:
     OpenAIResponseInput:
       oneOf:
         - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
+        - $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
         - $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
         - $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
         - $ref: '#/components/schemas/OpenAIResponseMessage'
@@ -5115,10 +5153,23 @@ components:
           type: string
           const: file_search
           default: file_search
-        vector_store_id:
+        vector_store_ids:
           type: array
           items:
             type: string
+        filters:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+        max_num_results:
+          type: integer
+          default: 10
         ranking_options:
           type: object
           properties:
@@ -5132,7 +5183,7 @@ components:
       additionalProperties: false
       required:
         - type
-        - vector_store_id
+        - vector_store_ids
       title: OpenAIResponseInputToolFileSearch
     OpenAIResponseInputToolFunction:
       type: object
@@ -5294,6 +5345,41 @@ components:
         - type
       title: >-
         OpenAIResponseOutputMessageContentOutputText
+    "OpenAIResponseOutputMessageFileSearchToolCall":
+      type: object
+      properties:
+        id:
+          type: string
+        queries:
+          type: array
+          items:
+            type: string
+        status:
+          type: string
+        type:
+          type: string
+          const: file_search_call
+          default: file_search_call
+        results:
+          type: array
+          items:
+            type: object
+            additionalProperties:
+              oneOf:
+                - type: 'null'
+                - type: boolean
+                - type: number
+                - type: string
+                - type: array
+                - type: object
+      additionalProperties: false
+      required:
+        - id
+        - queries
+        - status
+        - type
+      title: >-
+        OpenAIResponseOutputMessageFileSearchToolCall
     "OpenAIResponseOutputMessageFunctionToolCall":
       type: object
       properties:
@@ -5491,6 +5577,7 @@ components:
       oneOf:
         - $ref: '#/components/schemas/OpenAIResponseMessage'
        - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
+        - $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
         - $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
         - $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
         - $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
@@ -5499,6 +5586,7 @@ components:
       mapping:
         message: '#/components/schemas/OpenAIResponseMessage'
         web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
+        file_search_call: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
         function_call: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
         mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
         mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
@@ -8251,6 +8339,148 @@ components:
         - event
         - ttl_seconds
       title: LogEventRequest
+    VectorStoreChunkingStrategy:
+      oneOf:
+        - $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
+        - $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
+      discriminator:
+        propertyName: type
+        mapping:
+          auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
+          static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
+    VectorStoreChunkingStrategyAuto:
+      type: object
+      properties:
+        type:
+          type: string
+          const: auto
+          default: auto
+      additionalProperties: false
+      required:
+        - type
+      title: VectorStoreChunkingStrategyAuto
+    VectorStoreChunkingStrategyStatic:
+      type: object
+      properties:
+        type:
+          type: string
+          const: static
+          default: static
+        static:
+          $ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
+      additionalProperties: false
+      required:
+        - type
+        - static
+      title: VectorStoreChunkingStrategyStatic
+    VectorStoreChunkingStrategyStaticConfig:
+      type: object
+      properties:
+        chunk_overlap_tokens:
+          type: integer
+          default: 400
+        max_chunk_size_tokens:
+          type: integer
+          default: 800
+      additionalProperties: false
+      required:
+        - chunk_overlap_tokens
+        - max_chunk_size_tokens
+      title: VectorStoreChunkingStrategyStaticConfig
+    OpenaiAttachFileToVectorStoreRequest:
+      type: object
+      properties:
+        file_id:
+          type: string
+          description: >-
+            The ID of the file to attach to the vector store.
+        attributes:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+          description: >-
+            The key-value attributes stored with the file, which can be used for filtering.
+        chunking_strategy:
+          $ref: '#/components/schemas/VectorStoreChunkingStrategy'
+          description: >-
+            The chunking strategy to use for the file.
+      additionalProperties: false
+      required:
+        - file_id
+      title: OpenaiAttachFileToVectorStoreRequest
+    VectorStoreFileLastError:
+      type: object
+      properties:
+        code:
+          oneOf:
+            - type: string
+              const: server_error
+            - type: string
+              const: rate_limit_exceeded
+        message:
+          type: string
+      additionalProperties: false
+      required:
+        - code
+        - message
+      title: VectorStoreFileLastError
+    VectorStoreFileObject:
+      type: object
+      properties:
+        id:
+          type: string
+        object:
+          type: string
+          default: vector_store.file
+        attributes:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+        chunking_strategy:
+          $ref: '#/components/schemas/VectorStoreChunkingStrategy'
+        created_at:
+          type: integer
+        last_error:
+          $ref: '#/components/schemas/VectorStoreFileLastError'
+        status:
+          oneOf:
+            - type: string
+              const: completed
+            - type: string
+              const: in_progress
+            - type: string
+              const: cancelled
+            - type: string
+              const: failed
+        usage_bytes:
+          type: integer
+          default: 0
+        vector_store_id:
+          type: string
+      additionalProperties: false
+      required:
+        - id
+        - object
+        - attributes
+        - chunking_strategy
+        - created_at
+        - status
+        - usage_bytes
+        - vector_store_id
+      title: VectorStoreFileObject
+      description: OpenAI Vector Store File object.
     OpenAIJSONSchema:
       type: object
       properties:
@@ -8673,6 +8903,10 @@ components:
           type: string
         prompt_logprobs:
           type: integer
+        suffix:
+          type: string
+          description: >-
+            (Optional) The suffix that should be appended to the completion.
       additionalProperties: false
       required:
         - model
@@ -9526,7 +9760,13 @@ components:
         mode:
           type: string
           description: >-
-            Search mode for retrieval—either "vector" or "keyword". Default "vector".
+            Search mode for retrieval—either "vector", "keyword", or "hybrid". Default
+            "vector".
+        ranker:
+          $ref: '#/components/schemas/Ranker'
+          description: >-
+            Configuration for the ranker to use in hybrid search. Defaults to RRF
+            ranker.
       additionalProperties: false
       required:
         - query_generator_config
@@ -9545,6 +9785,58 @@ components:
       mapping:
         default: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
         llm: '#/components/schemas/LLMRAGQueryGeneratorConfig'
+    RRFRanker:
+      type: object
+      properties:
+        type:
+          type: string
+          const: rrf
+          default: rrf
+          description: The type of ranker, always "rrf"
+        impact_factor:
+          type: number
+          default: 60.0
+          description: >-
+            The impact factor for RRF scoring. Higher values give more weight to higher-ranked
+            results. Must be greater than 0. Default of 60 is from the original RRF
+            paper (Cormack et al., 2009).
+      additionalProperties: false
+      required:
+        - type
+        - impact_factor
+      title: RRFRanker
+      description: >-
+        Reciprocal Rank Fusion (RRF) ranker configuration.
+    Ranker:
+      oneOf:
+        - $ref: '#/components/schemas/RRFRanker'
+        - $ref: '#/components/schemas/WeightedRanker'
+      discriminator:
+        propertyName: type
+        mapping:
+          rrf: '#/components/schemas/RRFRanker'
+          weighted: '#/components/schemas/WeightedRanker'
+    WeightedRanker:
+      type: object
+      properties:
+        type:
+          type: string
+          const: weighted
+          default: weighted
+          description: The type of ranker, always "weighted"
+        alpha:
+          type: number
+          default: 0.5
+          description: >-
+            Weight factor between 0 and 1. 0 means only use keyword scores, 1 means
+            only use vector scores, values in between blend both scores.
+      additionalProperties: false
+      required:
+        - type
+        - alpha
+      title: WeightedRanker
+      description: >-
+        Weighted ranker configuration that combines vector and keyword scores.
     QueryRequest:
       type: object
       properties:
@@ -18,6 +18,7 @@ The `llamastack/distribution-ollama` distribution consists of the following providers
 | agents | `inline::meta-reference` |
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
+| files | `inline::localfs` |
 | inference | `remote::ollama` |
 | post_training | `inline::huggingface` |
 | safety | `inline::llama-guard` |
@@ -66,25 +66,126 @@ To use sqlite-vec in your Llama Stack project, follow these steps:
 2. Configure your Llama Stack project to use SQLite-Vec.
 3. Start storing and querying vectors.
 
-## Supported Search Modes
-
-The sqlite-vec provider supports both vector-based and keyword-based (full-text) search modes.
-
-When using the RAGTool interface, you can specify the desired search behavior via the `mode` parameter in
-`RAGQueryConfig`. For example:
-
-```python
-from llama_stack.apis.tool_runtime.rag import RAGQueryConfig
-
-query_config = RAGQueryConfig(max_chunks=6, mode="vector")
-
-results = client.tool_runtime.rag_tool.query(
-    vector_db_ids=[vector_db_id],
-    content="what is torchtune",
-    query_config=query_config,
-)
-```
+The SQLite-vec provider supports three search modes:
+
+1. **Vector Search** (`mode="vector"`): Performs pure vector similarity search using the embeddings.
+2. **Keyword Search** (`mode="keyword"`): Performs full-text search using SQLite's FTS5.
+3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword search for better results. First performs keyword search to get candidate matches, then applies vector similarity search on those candidates.
+
+Example with hybrid search:
+```python
+response = await vector_io.query_chunks(
+    vector_db_id="my_db",
+    query="your query here",
+    params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
+)
+
+# Using RRF ranker
+response = await vector_io.query_chunks(
+    vector_db_id="my_db",
+    query="your query here",
+    params={
+        "mode": "hybrid",
+        "max_chunks": 3,
+        "score_threshold": 0.7,
+        "ranker": {"type": "rrf", "impact_factor": 60.0},
+    },
+)
+
+# Using weighted ranker
+response = await vector_io.query_chunks(
+    vector_db_id="my_db",
+    query="your query here",
+    params={
+        "mode": "hybrid",
+        "max_chunks": 3,
+        "score_threshold": 0.7,
+        "ranker": {"type": "weighted", "alpha": 0.7},  # 70% vector, 30% keyword
+    },
+)
+```
+
+Example with explicit vector search:
+```python
+response = await vector_io.query_chunks(
+    vector_db_id="my_db",
+    query="your query here",
+    params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
+)
+```
+
+Example with keyword search:
+```python
+response = await vector_io.query_chunks(
+    vector_db_id="my_db",
+    query="your query here",
+    params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
+)
+```
+
+## Supported Search Modes
+
+The SQLite vector store supports three search modes:
+
+1. **Vector Search** (`mode="vector"`): Uses vector similarity to find relevant chunks
+2. **Keyword Search** (`mode="keyword"`): Uses keyword matching to find relevant chunks
+3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword scores using a ranker
+
+### Hybrid Search
+
+Hybrid search combines the strengths of both vector and keyword search by:
+- Computing vector similarity scores
+- Computing keyword match scores
+- Using a ranker to combine these scores
+
+Two ranker types are supported:
+
+1. **RRF (Reciprocal Rank Fusion)**:
+   - Combines ranks from both vector and keyword results
+   - Uses an impact factor (default: 60.0) to control the weight of higher-ranked results
+   - Good for balancing between vector and keyword results
+   - The default impact factor of 60.0 comes from the original RRF paper by Cormack et al. (2009) [^1], which found this value to provide optimal performance across various retrieval tasks
+
+2. **Weighted**:
+   - Linearly combines normalized vector and keyword scores
+   - Uses an alpha parameter (0-1) to control the blend:
+     - alpha=0: Only use keyword scores
+     - alpha=1: Only use vector scores
+     - alpha=0.5: Equal weight to both (default)
+
+Example using RAGQueryConfig with different search modes:
+
+```python
+from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker
+
+# Vector search
+config = RAGQueryConfig(mode="vector", max_chunks=5)
+
+# Keyword search
+config = RAGQueryConfig(mode="keyword", max_chunks=5)
+
+# Hybrid search with custom RRF ranker
+config = RAGQueryConfig(
+    mode="hybrid",
+    max_chunks=5,
+    ranker=RRFRanker(impact_factor=50.0),  # Custom impact factor
+)
+
+# Hybrid search with weighted ranker
+config = RAGQueryConfig(
+    mode="hybrid",
+    max_chunks=5,
+    ranker=WeightedRanker(alpha=0.7),  # 70% vector, 30% keyword
+)
+
+# Hybrid search with default RRF ranker
+config = RAGQueryConfig(
+    mode="hybrid", max_chunks=5
+)  # Will use RRF with impact_factor=60.0
+```
+
+Note: The ranker configuration is only used in hybrid mode. For vector or keyword modes, the ranker parameter is ignored.
 
 ## Installation
 
 You can install SQLite-Vec using pip:
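As a side note to the hybrid-search description above, the two ranker types reduce to simple score formulas; the sketch below is illustrative and not taken from the provider code:

```python
# Illustrative only: how RRF and weighted fusion combine per-document scores.
# rank_* are 1-based ranks from each search; score_* are normalized scores in [0, 1].
def rrf_score(rank_vector: int, rank_keyword: int, impact_factor: float = 60.0) -> float:
    # Reciprocal Rank Fusion (Cormack et al., 2009): documents ranked highly by either
    # list get a boost; impact_factor damps the influence of absolute rank position.
    return 1.0 / (impact_factor + rank_vector) + 1.0 / (impact_factor + rank_keyword)


def weighted_score(score_vector: float, score_keyword: float, alpha: float = 0.5) -> float:
    # Linear blend: alpha=1.0 keeps only vector scores, alpha=0.0 only keyword scores.
    return alpha * score_vector + (1.0 - alpha) * score_keyword
```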
||||||
## Documentation
|
## Documentation
|
||||||
|
|
||||||
See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) for more details about sqlite-vec in general.
|
See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) for more details about sqlite-vec in general.
|
||||||
|
|
||||||
|
[^1]: Cormack, G. V., Clarke, C. L., & Buettcher, S. (2009). [Reciprocal rank fusion outperforms condorcet and individual rank learning methods](https://dl.acm.org/doi/10.1145/1571941.1572114). In Proceedings of the 32nd international ACM SIGIR conference on Research and development in information retrieval (pp. 758-759).
|
||||||
|
|
|
@@ -81,6 +81,15 @@ class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
     type: Literal["web_search_call"] = "web_search_call"
 
 
+@json_schema_type
+class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
+    id: str
+    queries: list[str]
+    status: str
+    type: Literal["file_search_call"] = "file_search_call"
+    results: list[dict[str, Any]] | None = None
+
+
 @json_schema_type
 class OpenAIResponseOutputMessageFunctionToolCall(BaseModel):
     call_id: str
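For illustration, an output item of the new type could be constructed like this; the result dictionary keys are made up, since the model allows arbitrary dicts:

```python
# Hypothetical values; only the model and its field names come from the hunk above.
from llama_stack.apis.agents.openai_responses import OpenAIResponseOutputMessageFileSearchToolCall

call = OpenAIResponseOutputMessageFileSearchToolCall(
    id="fs_call_1",
    queries=["torchtune fine-tuning"],
    status="completed",
    results=[{"file_id": "file_456", "score": 0.87}],  # illustrative result keys
)
```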
@@ -119,6 +128,7 @@ class OpenAIResponseOutputMessageMCPListTools(BaseModel):
 OpenAIResponseOutput = Annotated[
     OpenAIResponseMessage
     | OpenAIResponseOutputMessageWebSearchToolCall
+    | OpenAIResponseOutputMessageFileSearchToolCall
     | OpenAIResponseOutputMessageFunctionToolCall
     | OpenAIResponseOutputMessageMCPCall
     | OpenAIResponseOutputMessageMCPListTools,
@@ -362,6 +372,7 @@ class OpenAIResponseInputFunctionToolCallOutput(BaseModel):
 OpenAIResponseInput = Annotated[
     # Responses API allows output messages to be passed in as input
     OpenAIResponseOutputMessageWebSearchToolCall
+    | OpenAIResponseOutputMessageFileSearchToolCall
     | OpenAIResponseOutputMessageFunctionToolCall
     | OpenAIResponseInputFunctionToolCallOutput
 
@@ -397,9 +408,10 @@ class FileSearchRankingOptions(BaseModel):
 @json_schema_type
 class OpenAIResponseInputToolFileSearch(BaseModel):
     type: Literal["file_search"] = "file_search"
-    vector_store_id: list[str]
+    vector_store_ids: list[str]
+    filters: dict[str, Any] | None = None
+    max_num_results: int | None = Field(default=10, ge=1, le=50)
     ranking_options: FileSearchRankingOptions | None = None
-    # TODO: add filters
 
 
 class ApprovalFilter(BaseModel):
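A quick sketch of the updated input-tool model in use; the vector store ID and filter values are placeholders:

```python
# Hypothetical usage of the updated Pydantic model; field names follow the hunk above.
from llama_stack.apis.agents.openai_responses import OpenAIResponseInputToolFileSearch

tool = OpenAIResponseInputToolFileSearch(
    vector_store_ids=["vs_123"],         # renamed from vector_store_id
    filters={"topic": "torchtune"},      # new optional attribute filter
    max_num_results=5,                   # validated to stay within 1..50
)
assert tool.type == "file_search"
```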
@@ -1038,6 +1038,8 @@ class InferenceProvider(Protocol):
         # vLLM-specific parameters
         guided_choice: list[str] | None = None,
         prompt_logprobs: int | None = None,
+        # for fill-in-the-middle type completion
+        suffix: str | None = None,
     ) -> OpenAICompletion:
         """Generate an OpenAI-compatible completion for the given prompt using the specified model.
 
@@ -1058,6 +1060,7 @@ class InferenceProvider(Protocol):
         :param temperature: (Optional) The temperature to use.
         :param top_p: (Optional) The top p to use.
         :param user: (Optional) The user to use.
+        :param suffix: (Optional) The suffix that should be appended to the completion.
         :returns: An OpenAICompletion.
         """
         ...
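The new `suffix` parameter targets fill-in-the-middle style completions; a hedged example of a call that supplies it (the `inference_api` object and model identifier are assumptions, not from this diff):

```python
# Sketch only: fill-in-the-middle completion via the protocol above.
async def fim_example(inference_api) -> None:
    completion = await inference_api.openai_completion(
        model="my-code-model",                 # placeholder model identifier
        prompt="def add(a, b):\n    return ",  # text before the span to fill
        suffix="\n\nprint(add(2, 3))",         # text expected to follow the generated span
    )
    print(completion)
```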
@@ -15,6 +15,48 @@ from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 
 
+@json_schema_type
+class RRFRanker(BaseModel):
+    """
+    Reciprocal Rank Fusion (RRF) ranker configuration.
+
+    :param type: The type of ranker, always "rrf"
+    :param impact_factor: The impact factor for RRF scoring. Higher values give more weight to higher-ranked results.
+        Must be greater than 0. Default of 60 is from the original RRF paper (Cormack et al., 2009).
+    """
+
+    type: Literal["rrf"] = "rrf"
+    impact_factor: float = Field(default=60.0, gt=0.0)  # default of 60 for optimal performance
+
+
+@json_schema_type
+class WeightedRanker(BaseModel):
+    """
+    Weighted ranker configuration that combines vector and keyword scores.
+
+    :param type: The type of ranker, always "weighted"
+    :param alpha: Weight factor between 0 and 1.
+        0 means only use keyword scores,
+        1 means only use vector scores,
+        values in between blend both scores.
+    """
+
+    type: Literal["weighted"] = "weighted"
+    alpha: float = Field(
+        default=0.5,
+        ge=0.0,
+        le=1.0,
+        description="Weight factor between 0 and 1. 0 means only keyword scores, 1 means only vector scores.",
+    )
+
+
+Ranker = Annotated[
+    RRFRanker | WeightedRanker,
+    Field(discriminator="type"),
+]
+register_schema(Ranker, name="Ranker")
+
+
 @json_schema_type
 class RAGDocument(BaseModel):
     """
@@ -76,7 +118,8 @@ class RAGQueryConfig(BaseModel):
     :param chunk_template: Template for formatting each retrieved chunk in the context.
         Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict).
         Default: "Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n"
-    :param mode: Search mode for retrieval—either "vector" or "keyword". Default "vector".
+    :param mode: Search mode for retrieval—either "vector", "keyword", or "hybrid". Default "vector".
+    :param ranker: Configuration for the ranker to use in hybrid search. Defaults to RRF ranker.
     """
 
     # This config defines how a query is generated using the messages
@@ -86,6 +129,7 @@ class RAGQueryConfig(BaseModel):
     max_chunks: int = 5
     chunk_template: str = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n"
     mode: str | None = None
+    ranker: Ranker | None = Field(default=None)  # Only used for hybrid mode
 
     @field_validator("chunk_template")
     def validate_chunk_template(cls, v: str) -> str:
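Because `Ranker` is a discriminated union on `type`, plain dictionaries validate into the right ranker class when building a `RAGQueryConfig`; a small sketch (the import path follows the docs example earlier in this diff):

```python
# Sketch: dict ranker configs resolve through the discriminated union added above.
from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker

config = RAGQueryConfig(mode="hybrid", ranker={"type": "weighted", "alpha": 0.7})
assert isinstance(config.ranker, WeightedRanker)

config = RAGQueryConfig(mode="hybrid", ranker={"type": "rrf"})
assert isinstance(config.ranker, RRFRanker) and config.ranker.impact_factor == 60.0
```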
@@ -8,7 +8,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Literal, Protocol, runtime_checkable
+from typing import Annotated, Any, Literal, Protocol, runtime_checkable
 
 from pydantic import BaseModel, Field
 
@@ -16,6 +16,7 @@ from llama_stack.apis.inference import InterleavedContent
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod
+from llama_stack.strong_typing.schema import register_schema
 
 
 class Chunk(BaseModel):
@@ -133,6 +134,50 @@ class VectorStoreDeleteResponse(BaseModel):
     deleted: bool = True
 
 
+@json_schema_type
+class VectorStoreChunkingStrategyAuto(BaseModel):
+    type: Literal["auto"] = "auto"
+
+
+@json_schema_type
+class VectorStoreChunkingStrategyStaticConfig(BaseModel):
+    chunk_overlap_tokens: int = 400
+    max_chunk_size_tokens: int = Field(800, ge=100, le=4096)
+
+
+@json_schema_type
+class VectorStoreChunkingStrategyStatic(BaseModel):
+    type: Literal["static"] = "static"
+    static: VectorStoreChunkingStrategyStaticConfig
+
+
+VectorStoreChunkingStrategy = Annotated[
+    VectorStoreChunkingStrategyAuto | VectorStoreChunkingStrategyStatic, Field(discriminator="type")
+]
+register_schema(VectorStoreChunkingStrategy, name="VectorStoreChunkingStrategy")
+
+
+@json_schema_type
+class VectorStoreFileLastError(BaseModel):
+    code: Literal["server_error"] | Literal["rate_limit_exceeded"]
+    message: str
+
+
+@json_schema_type
+class VectorStoreFileObject(BaseModel):
+    """OpenAI Vector Store File object."""
+
+    id: str
+    object: str = "vector_store.file"
+    attributes: dict[str, Any] = Field(default_factory=dict)
+    chunking_strategy: VectorStoreChunkingStrategy
+    created_at: int
+    last_error: VectorStoreFileLastError | None = None
+    status: Literal["completed"] | Literal["in_progress"] | Literal["cancelled"] | Literal["failed"]
+    usage_bytes: int = 0
+    vector_store_id: str
+
+
 class VectorDBStore(Protocol):
     def get_vector_db(self, vector_db_id: str) -> VectorDB | None: ...
 
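As a sanity check of the new models, a `VectorStoreFileObject` can be built directly with a static chunking strategy; the values below are placeholders:

```python
# Hypothetical values; classes and field names come from the hunk above, imported
# from the same module the router imports them from later in this diff.
from llama_stack.apis.vector_io.vector_io import (
    VectorStoreChunkingStrategyStatic,
    VectorStoreChunkingStrategyStaticConfig,
    VectorStoreFileObject,
)

file_obj = VectorStoreFileObject(
    id="file_456",
    chunking_strategy=VectorStoreChunkingStrategyStatic(
        static=VectorStoreChunkingStrategyStaticConfig(),  # 400 overlap / 800 max tokens
    ),
    created_at=1718000000,
    status="in_progress",
    vector_store_id="vs_123",
)
assert file_obj.object == "vector_store.file" and file_obj.usage_bytes == 0
```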
@@ -290,3 +335,21 @@ class VectorIO(Protocol):
         :returns: A VectorStoreSearchResponse containing the search results.
         """
         ...
+
+    @webmethod(route="/openai/v1/vector_stores/{vector_store_id}/files", method="POST")
+    async def openai_attach_file_to_vector_store(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        attributes: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> VectorStoreFileObject:
+        """Attach a file to a vector store.
+
+        :param vector_store_id: The ID of the vector store to attach the file to.
+        :param file_id: The ID of the file to attach to the vector store.
+        :param attributes: The key-value attributes stored with the file, which can be used for filtering.
+        :param chunking_strategy: The chunking strategy to use for the file.
+        :returns: A VectorStoreFileObject representing the attached file.
+        """
+        ...
@@ -426,6 +426,7 @@ class InferenceRouter(Inference):
         user: str | None = None,
         guided_choice: list[str] | None = None,
         prompt_logprobs: int | None = None,
+        suffix: str | None = None,
     ) -> OpenAICompletion:
         logger.debug(
             f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}",
@@ -456,6 +457,7 @@ class InferenceRouter(Inference):
             user=user,
             guided_choice=guided_choice,
             prompt_logprobs=prompt_logprobs,
+            suffix=suffix,
         )
 
         provider = self.routing_table.get_provider_impl(model_obj.identifier)
@@ -19,6 +19,7 @@ from llama_stack.apis.vector_io import (
     VectorStoreObject,
     VectorStoreSearchResponsePage,
 )
+from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import RoutingTable
 
@@ -254,3 +255,20 @@ class VectorIORouter(VectorIO):
             ranking_options=ranking_options,
             rewrite_query=rewrite_query,
         )
+
+    async def openai_attach_file_to_vector_store(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        attributes: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> VectorStoreFileObject:
+        logger.debug(f"VectorIORouter.openai_attach_file_to_vector_store: {vector_store_id}, {file_id}")
+        # Route based on vector store ID
+        provider = self.routing_table.get_provider_impl(vector_store_id)
+        return await provider.openai_attach_file_to_vector_store(
+            vector_store_id=vector_store_id,
+            file_id=file_id,
+            attributes=attributes,
+            chunking_strategy=chunking_strategy,
+        )
@@ -24,6 +24,7 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseInputMessageContentImage,
     OpenAIResponseInputMessageContentText,
     OpenAIResponseInputTool,
+    OpenAIResponseInputToolFileSearch,
     OpenAIResponseInputToolMCP,
     OpenAIResponseMessage,
     OpenAIResponseObject,
@@ -34,6 +35,7 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseOutput,
     OpenAIResponseOutputMessageContent,
     OpenAIResponseOutputMessageContentOutputText,
+    OpenAIResponseOutputMessageFileSearchToolCall,
     OpenAIResponseOutputMessageFunctionToolCall,
     OpenAIResponseOutputMessageMCPListTools,
     OpenAIResponseOutputMessageWebSearchToolCall,
@@ -62,7 +64,7 @@ from llama_stack.apis.inference.inference import (
     OpenAIToolMessageParam,
     OpenAIUserMessageParam,
 )
-from llama_stack.apis.tools.tools import ToolGroups, ToolRuntime
+from llama_stack.apis.tools import RAGQueryConfig, ToolGroups, ToolRuntime
 from llama_stack.log import get_logger
 from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
 from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
@@ -198,7 +200,8 @@ class OpenAIResponsePreviousResponseWithInputItems(BaseModel):
 class ChatCompletionContext(BaseModel):
     model: str
     messages: list[OpenAIMessageParam]
-    tools: list[ChatCompletionToolParam] | None = None
+    response_tools: list[OpenAIResponseInputTool] | None = None
+    chat_tools: list[ChatCompletionToolParam] | None = None
     mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP]
     temperature: float | None
     response_format: OpenAIResponseFormatParam
@@ -388,7 +391,8 @@ class OpenAIResponsesImpl:
         ctx = ChatCompletionContext(
             model=model,
             messages=messages,
-            tools=chat_tools,
+            response_tools=tools,
+            chat_tools=chat_tools,
             mcp_tool_to_server=mcp_tool_to_server,
             temperature=temperature,
             response_format=response_format,
@@ -417,7 +421,7 @@ class OpenAIResponsesImpl:
         completion_result = await self.inference_api.openai_chat_completion(
             model=ctx.model,
             messages=messages,
-            tools=ctx.tools,
+            tools=ctx.chat_tools,
             stream=True,
             temperature=ctx.temperature,
             response_format=ctx.response_format,
@@ -606,6 +610,12 @@ class OpenAIResponsesImpl:
                 if not tool:
                     raise ValueError(f"Tool {tool_name} not found")
                 chat_tools.append(make_openai_tool(tool_name, tool))
+            elif input_tool.type == "file_search":
+                tool_name = "knowledge_search"
+                tool = await self.tool_groups_api.get_tool(tool_name)
+                if not tool:
+                    raise ValueError(f"Tool {tool_name} not found")
+                chat_tools.append(make_openai_tool(tool_name, tool))
             elif input_tool.type == "mcp":
                 always_allowed = None
                 never_allowed = None
@@ -667,6 +677,7 @@ class OpenAIResponsesImpl:
 
         tool_call_id = tool_call.id
         function = tool_call.function
+        tool_kwargs = json.loads(function.arguments) if function.arguments else {}
 
         if not function or not tool_call_id or not function.name:
             return None, None
@@ -680,12 +691,26 @@ class OpenAIResponsesImpl:
                     endpoint=mcp_tool.server_url,
                     headers=mcp_tool.headers or {},
                     tool_name=function.name,
-                    kwargs=json.loads(function.arguments) if function.arguments else {},
+                    kwargs=tool_kwargs,
                 )
             else:
+                if function.name == "knowledge_search":
+                    response_file_search_tool = next(
+                        t for t in ctx.response_tools if isinstance(t, OpenAIResponseInputToolFileSearch)
+                    )
+                    if response_file_search_tool:
+                        if response_file_search_tool.filters:
+                            logger.warning("Filters are not yet supported for file_search tool")
+                        if response_file_search_tool.ranking_options:
+                            logger.warning("Ranking options are not yet supported for file_search tool")
+                        tool_kwargs["vector_db_ids"] = response_file_search_tool.vector_store_ids
+                        tool_kwargs["query_config"] = RAGQueryConfig(
+                            mode="vector",
+                            max_chunks=response_file_search_tool.max_num_results,
+                        )
                 result = await self.tool_runtime_api.invoke_tool(
                     tool_name=function.name,
-                    kwargs=json.loads(function.arguments) if function.arguments else {},
+                    kwargs=tool_kwargs,
                 )
         except Exception as e:
             error_exc = e
@@ -713,6 +738,27 @@ class OpenAIResponsesImpl:
             )
             if error_exc or (result.error_code and result.error_code > 0) or result.error_message:
                 message.status = "failed"
+        elif function.name == "knowledge_search":
+            message = OpenAIResponseOutputMessageFileSearchToolCall(
+                id=tool_call_id,
+                queries=[tool_kwargs.get("query", "")],
+                status="completed",
+            )
+            if "document_ids" in result.metadata:
+                message.results = []
+                for i, doc_id in enumerate(result.metadata["document_ids"]):
+                    text = result.metadata["chunks"][i] if "chunks" in result.metadata else None
+                    score = result.metadata["scores"][i] if "scores" in result.metadata else None
+                    message.results.append(
+                        {
+                            "file_id": doc_id,
+                            "filename": doc_id,
+                            "text": text,
+                            "score": score,
+                        }
+                    )
+            if error_exc or (result.error_code and result.error_code > 0) or result.error_message:
+                message.status = "failed"
         else:
             raise ValueError(f"Unknown tool {function.name} called")
 
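Taken together, these hunks route a `file_search` tool choice through the built-in `knowledge_search` tool and report the hits back as a file-search tool call. A hedged sketch of what a request might look like from the caller's side; the `client` object, model name, and IDs are placeholders and the client API shape is assumed, only the tool fields come from this diff.

# Hypothetical sketch of a Responses API request that exercises the new file_search path.
response = await client.responses.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    input="What does the deployment guide say about GPU memory?",
    tools=[
        {
            "type": "file_search",
            "vector_store_ids": ["vs_docs"],  # searched via the knowledge_search tool
            "max_num_results": 5,             # becomes RAGQueryConfig.max_chunks
            # "filters" and "ranking_options" are accepted but currently only logged as unsupported
        }
    ],
)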
@@ -121,8 +121,10 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
                 vector_db_id=vector_db_id,
                 query=query,
                 params={
-                    "max_chunks": query_config.max_chunks,
                     "mode": query_config.mode,
+                    "max_chunks": query_config.max_chunks,
+                    "score_threshold": 0.0,
+                    "ranker": query_config.ranker,
                 },
             )
             for vector_db_id in vector_db_ids
@@ -170,6 +172,8 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
             content=picked,
             metadata={
                 "document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]],
+                "chunks": [c.content for c in chunks[: len(picked)]],
+                "scores": scores[: len(picked)],
             },
         )
 
@@ -16,6 +16,6 @@ async def get_provider_impl(config: FaissVectorIOConfig, deps: dict[Api, Any]):
 
     assert isinstance(config, FaissVectorIOConfig), f"Unexpected config type: {type(config)}"
 
-    impl = FaissVectorIOAdapter(config, deps[Api.inference])
+    impl = FaissVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files, None))
     await impl.initialize()
     return impl
@@ -15,6 +15,7 @@ import faiss
 import numpy as np
 from numpy.typing import NDArray
 
+from llama_stack.apis.files import Files
 from llama_stack.apis.inference import InterleavedContent
 from llama_stack.apis.inference.inference import Inference
 from llama_stack.apis.vector_dbs import VectorDB
@@ -130,11 +131,23 @@ class FaissIndex(EmbeddingIndex):
     ) -> QueryChunksResponse:
         raise NotImplementedError("Keyword search is not supported in FAISS")
 
+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        raise NotImplementedError("Hybrid search is not supported in FAISS")
+
 
 class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
-    def __init__(self, config: FaissVectorIOConfig, inference_api: Inference) -> None:
+    def __init__(self, config: FaissVectorIOConfig, inference_api: Inference, files_api: Files | None) -> None:
         self.config = config
         self.inference_api = inference_api
+        self.files_api = files_api
         self.cache: dict[str, VectorDBWithIndex] = {}
         self.kvstore: KVStore | None = None
         self.openai_vector_stores: dict[str, dict[str, Any]] = {}
@@ -15,6 +15,6 @@ async def get_provider_impl(config: SQLiteVectorIOConfig, deps: dict[Api, Any]):
     from .sqlite_vec import SQLiteVecVectorIOAdapter
 
     assert isinstance(config, SQLiteVectorIOConfig), f"Unexpected config type: {type(config)}"
-    impl = SQLiteVecVectorIOAdapter(config, deps[Api.inference])
+    impl = SQLiteVecVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files, None))
     await impl.initialize()
     return impl
@@ -17,6 +17,7 @@ import numpy as np
 import sqlite_vec
 from numpy.typing import NDArray
 
+from llama_stack.apis.files.files import Files
 from llama_stack.apis.inference.inference import Inference
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.apis.vector_io import (
@@ -26,14 +27,20 @@ from llama_stack.apis.vector_io import (
 )
 from llama_stack.providers.datatypes import VectorDBsProtocolPrivate
 from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
-from llama_stack.providers.utils.memory.vector_store import EmbeddingIndex, VectorDBWithIndex
+from llama_stack.providers.utils.memory.vector_store import (
+    RERANKER_TYPE_RRF,
+    RERANKER_TYPE_WEIGHTED,
+    EmbeddingIndex,
+    VectorDBWithIndex,
+)
 
 logger = logging.getLogger(__name__)
 
 # Specifying search mode is dependent on the VectorIO provider.
 VECTOR_SEARCH = "vector"
 KEYWORD_SEARCH = "keyword"
-SEARCH_MODES = {VECTOR_SEARCH, KEYWORD_SEARCH}
+HYBRID_SEARCH = "hybrid"
+SEARCH_MODES = {VECTOR_SEARCH, KEYWORD_SEARCH, HYBRID_SEARCH}
 
 
 def serialize_vector(vector: list[float]) -> bytes:
@@ -50,6 +57,59 @@ def _create_sqlite_connection(db_path):
     return connection
 
 
+def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
+    """Normalize scores to [0,1] range using min-max normalization."""
+    if not scores:
+        return {}
+    min_score = min(scores.values())
+    max_score = max(scores.values())
+    score_range = max_score - min_score
+    if score_range > 0:
+        return {doc_id: (score - min_score) / score_range for doc_id, score in scores.items()}
+    return {doc_id: 1.0 for doc_id in scores}
+
+
+def _weighted_rerank(
+    vector_scores: dict[str, float],
+    keyword_scores: dict[str, float],
+    alpha: float = 0.5,
+) -> dict[str, float]:
+    """ReRanker that uses weighted average of scores."""
+    all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
+    normalized_vector_scores = _normalize_scores(vector_scores)
+    normalized_keyword_scores = _normalize_scores(keyword_scores)
+
+    return {
+        doc_id: (alpha * normalized_keyword_scores.get(doc_id, 0.0))
+        + ((1 - alpha) * normalized_vector_scores.get(doc_id, 0.0))
+        for doc_id in all_ids
+    }
+
+
+def _rrf_rerank(
+    vector_scores: dict[str, float],
+    keyword_scores: dict[str, float],
+    impact_factor: float = 60.0,
+) -> dict[str, float]:
+    """ReRanker that uses Reciprocal Rank Fusion."""
+    # Convert scores to ranks
+    vector_ranks = {
+        doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(vector_scores.items(), key=lambda x: x[1], reverse=True))
+    }
+    keyword_ranks = {
+        doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True))
+    }
+
+    all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
+    rrf_scores = {}
+    for doc_id in all_ids:
+        vector_rank = vector_ranks.get(doc_id, float("inf"))
+        keyword_rank = keyword_ranks.get(doc_id, float("inf"))
+        # RRF formula: score = 1/(k + r) where k is impact_factor and r is the rank
+        rrf_scores[doc_id] = (1.0 / (impact_factor + vector_rank)) + (1.0 / (impact_factor + keyword_rank))
+    return rrf_scores
+
+
 class SQLiteVecIndex(EmbeddingIndex):
     """
     An index implementation that stores embeddings in a SQLite virtual table using sqlite-vec.
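As a sanity check of how the two rerankers added above behave, here is a small self-contained example with made-up scores; the numbers are illustrative only and mirror the helper signatures in this hunk.

# Illustrative only: toy vector/keyword scores for three documents.
vector_scores = {"doc1": 0.9, "doc2": 0.5, "doc3": 0.1}
keyword_scores = {"doc2": 3.0, "doc3": 1.0}

# Weighted fusion first min-max normalizes each score set, then mixes with alpha.
# With alpha=0.5, doc2 gets 0.5 * 1.0 (keyword) + 0.5 * 0.5 (vector) = 0.75.
weighted = _weighted_rerank(vector_scores, keyword_scores, alpha=0.5)

# RRF ignores magnitudes and only uses ranks; doc1 is rank 1 for vectors but
# absent from keywords, so it scores 1/(60+1) + 1/(60+inf) which is roughly 0.0164.
rrf = _rrf_rerank(vector_scores, keyword_scores, impact_factor=60.0)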
@@ -254,8 +314,6 @@ class SQLiteVecIndex(EmbeddingIndex):
         """
         Performs keyword-based search using SQLite FTS5 for relevance-ranked full-text search.
         """
-        if query_string is None:
-            raise ValueError("query_string is required for keyword search.")
 
         def _execute_query():
             connection = _create_sqlite_connection(self.db_path)
@@ -293,6 +351,81 @@ class SQLiteVecIndex(EmbeddingIndex):
                 scores.append(score)
         return QueryChunksResponse(chunks=chunks, scores=scores)
 
+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str = RERANKER_TYPE_RRF,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        """
+        Hybrid search using a configurable re-ranking strategy.
+
+        Args:
+            embedding: The query embedding vector
+            query_string: The text query for keyword search
+            k: Number of results to return
+            score_threshold: Minimum similarity score threshold
+            reranker_type: Type of reranker to use ("rrf" or "weighted")
+            reranker_params: Parameters for the reranker
+
+        Returns:
+            QueryChunksResponse with combined results
+        """
+        if reranker_params is None:
+            reranker_params = {}
+
+        # Get results from both search methods
+        vector_response = await self.query_vector(embedding, k, score_threshold)
+        keyword_response = await self.query_keyword(query_string, k, score_threshold)
+
+        # Convert responses to score dictionaries using generate_chunk_id
+        vector_scores = {
+            generate_chunk_id(chunk.metadata["document_id"], str(chunk.content)): score
+            for chunk, score in zip(vector_response.chunks, vector_response.scores, strict=False)
+        }
+        keyword_scores = {
+            generate_chunk_id(chunk.metadata["document_id"], str(chunk.content)): score
+            for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
+        }
+
+        # Combine scores using the specified reranker
+        if reranker_type == RERANKER_TYPE_WEIGHTED:
+            alpha = reranker_params.get("alpha", 0.5)
+            combined_scores = _weighted_rerank(vector_scores, keyword_scores, alpha)
+        else:
+            # Default to RRF for None, RRF, or any unknown types
+            impact_factor = reranker_params.get("impact_factor", 60.0)
+            combined_scores = _rrf_rerank(vector_scores, keyword_scores, impact_factor)
+
+        # Sort by combined score and get top k results
+        sorted_items = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
+        top_k_items = sorted_items[:k]
+
+        # Filter by score threshold
+        filtered_items = [(doc_id, score) for doc_id, score in top_k_items if score >= score_threshold]
+
+        # Create a map of chunk_id to chunk for both responses
+        chunk_map = {}
+        for c in vector_response.chunks:
+            chunk_id = generate_chunk_id(c.metadata["document_id"], str(c.content))
+            chunk_map[chunk_id] = c
+        for c in keyword_response.chunks:
+            chunk_id = generate_chunk_id(c.metadata["document_id"], str(c.content))
+            chunk_map[chunk_id] = c
+
+        # Use the map to look up chunks by their IDs
+        chunks = []
+        scores = []
+        for doc_id, score in filtered_items:
+            if doc_id in chunk_map:
+                chunks.append(chunk_map[doc_id])
+                scores.append(score)
+
+        return QueryChunksResponse(chunks=chunks, scores=scores)
+
 
 class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
     """
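A hedged sketch of calling the new hybrid query directly on an index; the index object and query embedding are placeholders, and in practice VectorDBWithIndex drives this from the query params shown further down.

# Hypothetical direct call; `index` is assumed to be an initialized SQLiteVecIndex and
# `query_embedding` a NumPy vector produced by the configured embedding model.
response = await index.query_hybrid(
    embedding=query_embedding,
    query_string="rotate api keys",
    k=5,
    score_threshold=0.0,
    reranker_type=RERANKER_TYPE_RRF,          # or RERANKER_TYPE_WEIGHTED
    reranker_params={"impact_factor": 60.0},  # {"alpha": 0.7} for the weighted reranker
)
for chunk, score in zip(response.chunks, response.scores, strict=False):
    print(score, chunk.metadata["document_id"])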
@@ -301,9 +434,10 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
     and creates a cache of VectorDBWithIndex instances (each wrapping a SQLiteVecIndex).
     """
 
-    def __init__(self, config, inference_api: Inference) -> None:
+    def __init__(self, config, inference_api: Inference, files_api: Files | None) -> None:
         self.config = config
         self.inference_api = inference_api
+        self.files_api = files_api
         self.cache: dict[str, VectorDBWithIndex] = {}
         self.openai_vector_stores: dict[str, dict[str, Any]] = {}
 
@@ -343,7 +477,9 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
                 vector_db_data = row[0]
                 vector_db = VectorDB.model_validate_json(vector_db_data)
                 index = await SQLiteVecIndex.create(
-                    vector_db.embedding_dimension, self.config.db_path, vector_db.identifier
+                    vector_db.embedding_dimension,
+                    self.config.db_path,
+                    vector_db.identifier,
                 )
                 self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)
 
@@ -369,7 +505,11 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc
             connection.close()
 
         await asyncio.to_thread(_register_db)
-        index = await SQLiteVecIndex.create(vector_db.embedding_dimension, self.config.db_path, vector_db.identifier)
+        index = await SQLiteVecIndex.create(
+            vector_db.embedding_dimension,
+            self.config.db_path,
+            vector_db.identifier,
+        )
         self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)
 
     async def list_vector_dbs(self) -> list[VectorDB]:
@@ -24,6 +24,7 @@ def available_providers() -> list[ProviderSpec]:
             config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig",
             deprecation_warning="Please use the `inline::faiss` provider instead.",
             api_dependencies=[Api.inference],
+            optional_api_dependencies=[Api.files],
         ),
         InlineProviderSpec(
             api=Api.vector_io,
@@ -32,6 +33,7 @@ def available_providers() -> list[ProviderSpec]:
             module="llama_stack.providers.inline.vector_io.faiss",
             config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig",
             api_dependencies=[Api.inference],
+            optional_api_dependencies=[Api.files],
         ),
         # NOTE: sqlite-vec cannot be bundled into the container image because it does not have a
         # source distribution and the wheels are not available for all platforms.
@@ -42,6 +44,7 @@ def available_providers() -> list[ProviderSpec]:
             module="llama_stack.providers.inline.vector_io.sqlite_vec",
             config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig",
             api_dependencies=[Api.inference],
+            optional_api_dependencies=[Api.files],
         ),
         InlineProviderSpec(
             api=Api.vector_io,
@@ -51,6 +54,7 @@ def available_providers() -> list[ProviderSpec]:
             config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig",
             deprecation_warning="Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.",
             api_dependencies=[Api.inference],
+            optional_api_dependencies=[Api.files],
         ),
         remote_provider_spec(
             Api.vector_io,
@@ -318,6 +318,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
         user: str | None = None,
         guided_choice: list[str] | None = None,
         prompt_logprobs: int | None = None,
+        suffix: str | None = None,
     ) -> OpenAICompletion:
         model_obj = await self.model_store.get_model(model)
 
@@ -316,6 +316,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
         user: str | None = None,
         guided_choice: list[str] | None = None,
         prompt_logprobs: int | None = None,
+        suffix: str | None = None,
     ) -> OpenAICompletion:
         provider_model_id = await self._get_provider_model_id(model)
 
@@ -33,7 +33,6 @@ from llama_stack.apis.inference import (
     JsonSchemaResponseFormat,
     LogProbConfig,
     Message,
-    OpenAIEmbeddingsResponse,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -46,6 +45,8 @@ from llama_stack.apis.inference.inference import (
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
     OpenAICompletion,
+    OpenAIEmbeddingsResponse,
+    OpenAIEmbeddingUsage,
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
 )
@@ -62,8 +63,10 @@ from llama_stack.providers.utils.inference.model_registry import (
 from llama_stack.providers.utils.inference.openai_compat import (
     OpenAICompatCompletionChoice,
     OpenAICompatCompletionResponse,
+    b64_encode_openai_embeddings_response,
     get_sampling_options,
     prepare_openai_completion_params,
+    prepare_openai_embeddings_params,
     process_chat_completion_response,
     process_chat_completion_stream_response,
     process_completion_response,
@@ -386,7 +389,35 @@ class OllamaInferenceAdapter(
         dimensions: int | None = None,
         user: str | None = None,
     ) -> OpenAIEmbeddingsResponse:
-        raise NotImplementedError()
+        model_obj = await self._get_model(model)
+        if model_obj.model_type != ModelType.embedding:
+            raise ValueError(f"Model {model} is not an embedding model")
+
+        if model_obj.provider_resource_id is None:
+            raise ValueError(f"Model {model} has no provider_resource_id set")
+
+        # Note, at the moment Ollama does not support encoding_format, dimensions, and user parameters
+        params = prepare_openai_embeddings_params(
+            model=model_obj.provider_resource_id,
+            input=input,
+            encoding_format=encoding_format,
+            dimensions=dimensions,
+            user=user,
+        )
+
+        response = await self.openai_client.embeddings.create(**params)
+        data = b64_encode_openai_embeddings_response(response.data, encoding_format)
+
+        usage = OpenAIEmbeddingUsage(
+            prompt_tokens=response.usage.prompt_tokens,
+            total_tokens=response.usage.total_tokens,
+        )
+        # TODO: Investigate why model_obj.identifier is used instead of response.model
+        return OpenAIEmbeddingsResponse(
+            data=data,
+            model=model_obj.identifier,
+            usage=usage,
+        )
 
     async def openai_completion(
         self,
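A minimal sketch of exercising the newly implemented Ollama embeddings path; the adapter instance and model ID are placeholders, the model must be registered as an embedding model, and the method name openai_embeddings is assumed since this hunk starts mid-signature.

# Hypothetical call; `adapter` is assumed to be an initialized OllamaInferenceAdapter
# and "all-minilm" a registered embedding model.
resp = await adapter.openai_embeddings(
    model="all-minilm",
    input=["first sentence", "second sentence"],
    encoding_format="float",  # "base64" would return base64-packed float32 bytes instead
)
vectors = [item.embedding for item in resp.data]
print(resp.model, resp.usage.prompt_tokens, len(vectors))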
@@ -409,6 +440,7 @@ class OllamaInferenceAdapter(
         user: str | None = None,
         guided_choice: list[str] | None = None,
         prompt_logprobs: int | None = None,
+        suffix: str | None = None,
     ) -> OpenAICompletion:
         if not isinstance(prompt, str):
             raise ValueError("Ollama does not support non-string prompts for completion")
@@ -432,6 +464,7 @@ class OllamaInferenceAdapter(
             temperature=temperature,
             top_p=top_p,
             user=user,
+            suffix=suffix,
         )
         return await self.openai_client.completions.create(**params)  # type: ignore
 
@@ -90,6 +90,7 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
         user: str | None = None,
         guided_choice: list[str] | None = None,
         prompt_logprobs: int | None = None,
+        suffix: str | None = None,
     ) -> OpenAICompletion:
         if guided_choice is not None:
             logging.warning("guided_choice is not supported by the OpenAI API. Ignoring.")
@@ -117,6 +118,7 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
             temperature=temperature,
             top_p=top_p,
             user=user,
+            suffix=suffix,
         )
         return await self._openai_client.completions.create(**params)
 
@@ -242,6 +242,7 @@ class PassthroughInferenceAdapter(Inference):
         user: str | None = None,
         guided_choice: list[str] | None = None,
         prompt_logprobs: int | None = None,
+        suffix: str | None = None,
     ) -> OpenAICompletion:
         client = self._get_client()
         model_obj = await self.model_store.get_model(model)
@@ -299,6 +299,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
         user: str | None = None,
         guided_choice: list[str] | None = None,
         prompt_logprobs: int | None = None,
+        suffix: str | None = None,
     ) -> OpenAICompletion:
         model_obj = await self.model_store.get_model(model)
         params = await prepare_openai_completion_params(
@@ -559,6 +559,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         user: str | None = None,
         guided_choice: list[str] | None = None,
         prompt_logprobs: int | None = None,
+        suffix: str | None = None,
     ) -> OpenAICompletion:
         self._lazy_initialize_client()
         model_obj = await self._get_model(model)
@@ -313,6 +313,7 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
         user: str | None = None,
         guided_choice: list[str] | None = None,
         prompt_logprobs: int | None = None,
+        suffix: str | None = None,
     ) -> OpenAICompletion:
         model_obj = await self.model_store.get_model(model)
         params = await prepare_openai_completion_params(
@@ -23,6 +23,7 @@ from llama_stack.apis.vector_io import (
     VectorStoreObject,
     VectorStoreSearchResponsePage,
 )
+from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
 from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
 from llama_stack.providers.inline.vector_io.chroma import ChromaVectorIOConfig as InlineChromaVectorIOConfig
 from llama_stack.providers.utils.memory.vector_store import (
@@ -104,6 +105,17 @@ class ChromaIndex(EmbeddingIndex):
     ) -> QueryChunksResponse:
         raise NotImplementedError("Keyword search is not supported in Chroma")
 
+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        raise NotImplementedError("Hybrid search is not supported in Chroma")
+
 
 class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     def __init__(
@@ -241,3 +253,12 @@ class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
         rewrite_query: bool | None = False,
     ) -> VectorStoreSearchResponsePage:
         raise NotImplementedError("OpenAI Vector Stores API is not supported in Chroma")
+
+    async def openai_attach_file_to_vector_store(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        attributes: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> VectorStoreFileObject:
+        raise NotImplementedError("OpenAI Vector Stores API is not supported in Chroma")
@@ -25,6 +25,7 @@ from llama_stack.apis.vector_io import (
     VectorStoreObject,
     VectorStoreSearchResponsePage,
 )
+from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
 from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
 from llama_stack.providers.inline.vector_io.milvus import MilvusVectorIOConfig as InlineMilvusVectorIOConfig
 from llama_stack.providers.utils.memory.vector_store import (
@@ -102,6 +103,17 @@ class MilvusIndex(EmbeddingIndex):
     ) -> QueryChunksResponse:
         raise NotImplementedError("Keyword search is not supported in Milvus")
 
+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        raise NotImplementedError("Hybrid search is not supported in Milvus")
+
 
 class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     def __init__(
@@ -240,6 +252,15 @@ class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
     ) -> VectorStoreSearchResponsePage:
         raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")
 
+    async def openai_attach_file_to_vector_store(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        attributes: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> VectorStoreFileObject:
+        raise NotImplementedError("OpenAI Vector Stores API is not supported in Milvus")
+
 
 def generate_chunk_id(document_id: str, chunk_text: str) -> str:
     """Generate a unique chunk ID using a hash of document ID and chunk text."""
@@ -128,6 +128,17 @@ class PGVectorIndex(EmbeddingIndex):
     ) -> QueryChunksResponse:
         raise NotImplementedError("Keyword search is not supported in PGVector")
 
+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        raise NotImplementedError("Hybrid search is not supported in PGVector")
+
     async def delete(self):
         with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
             cur.execute(f"DROP TABLE IF EXISTS {self.table_name}")
@@ -23,6 +23,7 @@ from llama_stack.apis.vector_io import (
     VectorStoreObject,
     VectorStoreSearchResponsePage,
 )
+from llama_stack.apis.vector_io.vector_io import VectorStoreChunkingStrategy, VectorStoreFileObject
 from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
 from llama_stack.providers.inline.vector_io.qdrant import QdrantVectorIOConfig as InlineQdrantVectorIOConfig
 from llama_stack.providers.utils.memory.vector_store import (
@@ -111,6 +112,17 @@ class QdrantIndex(EmbeddingIndex):
     ) -> QueryChunksResponse:
         raise NotImplementedError("Keyword search is not supported in Qdrant")
 
+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        raise NotImplementedError("Hybrid search is not supported in Qdrant")
+
     async def delete(self):
         await self.client.delete_collection(collection_name=self.collection_name)
 
@@ -241,3 +253,12 @@ class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
         rewrite_query: bool | None = False,
     ) -> VectorStoreSearchResponsePage:
         raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")
+
+    async def openai_attach_file_to_vector_store(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        attributes: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> VectorStoreFileObject:
+        raise NotImplementedError("OpenAI Vector Stores API is not supported in Qdrant")
@@ -92,6 +92,17 @@ class WeaviateIndex(EmbeddingIndex):
     ) -> QueryChunksResponse:
         raise NotImplementedError("Keyword search is not supported in Weaviate")
 
+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        raise NotImplementedError("Hybrid search is not supported in Weaviate")
+
 
 class WeaviateVectorIOAdapter(
     VectorIO,
@@ -4,8 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-import base64
-import struct
 from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any
 
@@ -37,7 +35,6 @@ from llama_stack.apis.inference.inference import (
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
     OpenAICompletion,
-    OpenAIEmbeddingData,
     OpenAIEmbeddingsResponse,
     OpenAIEmbeddingUsage,
     OpenAIMessageParam,
@@ -48,6 +45,7 @@ from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 from llama_stack.providers.utils.inference.openai_compat import (
+    b64_encode_openai_embeddings_response,
     convert_message_to_openai_dict_new,
     convert_openai_chat_completion_choice,
     convert_openai_chat_completion_stream,
@@ -293,16 +291,7 @@ class LiteLLMOpenAIMixin(
         )
 
         # Convert response to OpenAI format
-        data = []
-        for i, embedding_data in enumerate(response["data"]):
-            # we encode to base64 if the encoding format is base64 in the request
-            if encoding_format == "base64":
-                byte_data = b"".join(struct.pack("f", f) for f in embedding_data["embedding"])
-                embedding = base64.b64encode(byte_data).decode("utf-8")
-            else:
-                embedding = embedding_data["embedding"]
-
-            data.append(OpenAIEmbeddingData(embedding=embedding, index=i))
+        data = b64_encode_openai_embeddings_response(response.data, encoding_format)
 
         usage = OpenAIEmbeddingUsage(
             prompt_tokens=response["usage"]["prompt_tokens"],
@@ -336,6 +325,7 @@ class LiteLLMOpenAIMixin(
         user: str | None = None,
         guided_choice: list[str] | None = None,
         prompt_logprobs: int | None = None,
+        suffix: str | None = None,
     ) -> OpenAICompletion:
         model_obj = await self.model_store.get_model(model)
         params = await prepare_openai_completion_params(
@@ -3,8 +3,10 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import base64
 import json
 import logging
+import struct
 import time
 import uuid
 import warnings
@@ -108,6 +110,7 @@ from llama_stack.apis.inference.inference import (
     OpenAIChatCompletion,
     OpenAICompletion,
     OpenAICompletionChoice,
+    OpenAIEmbeddingData,
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
     ToolConfig,
@@ -1287,6 +1290,7 @@ class OpenAICompletionToLlamaStackMixin:
         user: str | None = None,
         guided_choice: list[str] | None = None,
         prompt_logprobs: int | None = None,
+        suffix: str | None = None,
     ) -> OpenAICompletion:
         if stream:
             raise ValueError(f"{self.__class__.__name__} doesn't support streaming openai completions")
@@ -1483,3 +1487,55 @@ class OpenAIChatCompletionToLlamaStackMixin:
             model=model,
             object="chat.completion",
         )
+
+
+def prepare_openai_embeddings_params(
+    model: str,
+    input: str | list[str],
+    encoding_format: str | None = "float",
+    dimensions: int | None = None,
+    user: str | None = None,
+):
+    if model is None:
+        raise ValueError("Model must be provided for embeddings")
+
+    input_list = [input] if isinstance(input, str) else input
+
+    params: dict[str, Any] = {
+        "model": model,
+        "input": input_list,
+    }
+
+    if encoding_format is not None:
+        params["encoding_format"] = encoding_format
+    if dimensions is not None:
+        params["dimensions"] = dimensions
+    if user is not None:
+        params["user"] = user
+
+    return params
+
+
+def b64_encode_openai_embeddings_response(
+    response_data: dict, encoding_format: str | None = "float"
+) -> list[OpenAIEmbeddingData]:
+    """
+    Process the OpenAI embeddings response to encode the embeddings in base64 format if specified.
+    """
+    data = []
+    for i, embedding_data in enumerate(response_data):
+        if encoding_format == "base64":
+            byte_array = bytearray()
+            for embedding_value in embedding_data.embedding:
+                byte_array.extend(struct.pack("f", float(embedding_value)))
+
+            response_embedding = base64.b64encode(byte_array).decode("utf-8")
+        else:
+            response_embedding = embedding_data.embedding
+        data.append(
+            OpenAIEmbeddingData(
+                embedding=response_embedding,
+                index=i,
+            )
+        )
+    return data
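For readers wondering how a caller recovers float vectors from the base64 form produced above, here is a small illustrative sketch of the inverse operation; it simply unpacks the float32 bytes that struct.pack("f", ...) wrote.

import base64
import struct

def decode_base64_embedding(b64_embedding: str) -> list[float]:
    # Inverse of the encoding above: base64 -> raw bytes -> float32 values.
    raw = base64.b64decode(b64_embedding)
    count = len(raw) // 4  # each float32 is 4 bytes
    return list(struct.unpack(f"{count}f", raw))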
@@ -5,11 +5,13 @@
 # the root directory of this source tree.
 
 import logging
+import mimetypes
 import time
 import uuid
 from abc import ABC, abstractmethod
 from typing import Any
 
+from llama_stack.apis.files import Files
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.apis.vector_io import (
     QueryChunksResponse,
@@ -20,6 +22,15 @@ from llama_stack.apis.vector_io import (
     VectorStoreSearchResponse,
     VectorStoreSearchResponsePage,
 )
+from llama_stack.apis.vector_io.vector_io import (
+    Chunk,
+    VectorStoreChunkingStrategy,
+    VectorStoreChunkingStrategyAuto,
+    VectorStoreChunkingStrategyStatic,
+    VectorStoreFileLastError,
+    VectorStoreFileObject,
+)
+from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, make_overlapped_chunks
 
 logger = logging.getLogger(__name__)
 
@@ -36,6 +47,7 @@ class OpenAIVectorStoreMixin(ABC):
 
     # These should be provided by the implementing class
     openai_vector_stores: dict[str, dict[str, Any]]
+    files_api: Files | None
 
     @abstractmethod
     async def _save_openai_vector_store(self, store_id: str, store_info: dict[str, Any]) -> None:
@@ -67,6 +79,16 @@ class OpenAIVectorStoreMixin(ABC):
         """Unregister a vector database (provider-specific implementation)."""
         pass
 
+    @abstractmethod
+    async def insert_chunks(
+        self,
+        vector_db_id: str,
+        chunks: list[Chunk],
+        ttl_seconds: int | None = None,
+    ) -> None:
+        """Insert chunks into a vector database (provider-specific implementation)."""
+        pass
+
     @abstractmethod
     async def query_chunks(
         self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
@@ -383,3 +405,78 @@ class OpenAIVectorStoreMixin(ABC):
             if metadata[key] != value:
                 return False
         return True
+
+    async def openai_attach_file_to_vector_store(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        attributes: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> VectorStoreFileObject:
+        attributes = attributes or {}
+        chunking_strategy = chunking_strategy or VectorStoreChunkingStrategyAuto()
+
+        vector_store_file_object = VectorStoreFileObject(
+            id=file_id,
+            attributes=attributes,
+            chunking_strategy=chunking_strategy,
+            created_at=int(time.time()),
+            status="in_progress",
+            vector_store_id=vector_store_id,
+        )
+
+        if not hasattr(self, "files_api") or not self.files_api:
+            vector_store_file_object.status = "failed"
+            vector_store_file_object.last_error = VectorStoreFileLastError(
+                code="server_error",
+                message="Files API is not available",
+            )
+            return vector_store_file_object
+
+        if isinstance(chunking_strategy, VectorStoreChunkingStrategyStatic):
+            max_chunk_size_tokens = chunking_strategy.static.max_chunk_size_tokens
+            chunk_overlap_tokens = chunking_strategy.static.chunk_overlap_tokens
+        else:
+            # Default values from OpenAI API spec
+            max_chunk_size_tokens = 800
+            chunk_overlap_tokens = 400
+
+        try:
+            file_response = await self.files_api.openai_retrieve_file(file_id)
+            mime_type, _ = mimetypes.guess_type(file_response.filename)
+            content_response = await self.files_api.openai_retrieve_file_content(file_id)
+
+            content = content_from_data_and_mime_type(content_response.body, mime_type)
+
+            chunks = make_overlapped_chunks(
+                file_id,
+                content,
+                max_chunk_size_tokens,
+                chunk_overlap_tokens,
+                attributes,
+            )
+
+            if not chunks:
+                vector_store_file_object.status = "failed"
+                vector_store_file_object.last_error = VectorStoreFileLastError(
+                    code="server_error",
+                    message="No chunks were generated from the file",
+                )
+                return vector_store_file_object
+
+            await self.insert_chunks(
+                vector_db_id=vector_store_id,
+                chunks=chunks,
+            )
+        except Exception as e:
+            logger.error(f"Error attaching file to vector store: {e}")
+            vector_store_file_object.status = "failed"
+            vector_store_file_object.last_error = VectorStoreFileLastError(
+                code="server_error",
+                message=str(e),
+            )
+            return vector_store_file_object
+
+        vector_store_file_object.status = "completed"
+
+        return vector_store_file_object
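The mixin falls back to the OpenAI defaults (800-token chunks with 400 tokens of overlap) when no static strategy is given. A hedged sketch of supplying an explicit strategy; the adapter object is a placeholder and the name of the nested static config model is an assumption, only the outer VectorStoreChunkingStrategyStatic and its fields come from the code above.

# Hypothetical sketch; `adapter` is any provider that mixes in OpenAIVectorStoreMixin
# and has both files_api and insert_chunks wired up.
from llama_stack.apis.vector_io.vector_io import (
    VectorStoreChunkingStrategyStatic,
    VectorStoreChunkingStrategyStaticConfig,  # assumed name of the nested "static" config model
)

strategy = VectorStoreChunkingStrategyStatic(
    static=VectorStoreChunkingStrategyStaticConfig(
        max_chunk_size_tokens=512,
        chunk_overlap_tokens=128,
    )
)
file_object = await adapter.openai_attach_file_to_vector_store(
    vector_store_id="vs_123",
    file_id="file_abc",
    chunking_strategy=strategy,
)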
@@ -32,6 +32,10 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
 
 log = logging.getLogger(__name__)
 
+# Constants for reranker types
+RERANKER_TYPE_RRF = "rrf"
+RERANKER_TYPE_WEIGHTED = "weighted"
+
 
 def parse_pdf(data: bytes) -> str:
     # For PDF and DOC/DOCX files, we can't reliably convert to string
@@ -72,16 +76,18 @@ def content_from_data(data_url: str) -> str:
     data = unquote(data)
     encoding = parts["encoding"] or "utf-8"
     data = data.encode(encoding)
+    return content_from_data_and_mime_type(data, parts["mimetype"], parts.get("encoding", None))
 
-    encoding = parts["encoding"]
+
+def content_from_data_and_mime_type(data: bytes | str, mime_type: str | None, encoding: str | None = None) -> str:
+    if isinstance(data, bytes):
         if not encoding:
             import chardet
 
             detected = chardet.detect(data)
             encoding = detected["encoding"]
 
-    mime_type = parts["mimetype"]
-    mime_category = mime_type.split("/")[0]
+    mime_category = mime_type.split("/")[0] if mime_type else None
     if mime_category == "text":
         # For text-based files (including CSV, MD)
         return data.decode(encoding)
@@ -200,6 +206,18 @@ class EmbeddingIndex(ABC):
     async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
         raise NotImplementedError()
 
+    @abstractmethod
+    async def query_hybrid(
+        self,
+        embedding: NDArray,
+        query_string: str,
+        k: int,
+        score_threshold: float,
+        reranker_type: str,
+        reranker_params: dict[str, Any] | None = None,
+    ) -> QueryChunksResponse:
+        raise NotImplementedError()
+
     @abstractmethod
     async def delete(self):
         raise NotImplementedError()
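A concrete index has to supply query_hybrid itself. The sketch below is only an illustration of the contract, written in terms of the two existing query paths and reciprocal rank fusion; it assumes the same module context as above (NDArray, Any, QueryChunksResponse) and is not how any particular provider (for example sqlite-vec, which does this work in the database) actually implements it:

    async def query_hybrid(
        self,
        embedding: NDArray,
        query_string: str,
        k: int,
        score_threshold: float,
        reranker_type: str,
        reranker_params: dict[str, Any] | None = None,
    ) -> QueryChunksResponse:
        # Only the RRF path is sketched; "weighted" would blend normalized scores instead.
        impact_factor = (reranker_params or {}).get("impact_factor", 60.0)

        vector_resp = await self.query_vector(embedding, k * 2, score_threshold=0.0)
        keyword_resp = await self.query_keyword(query_string, k * 2, score_threshold=0.0)

        scores: dict[str, float] = {}
        chunks_by_id = {}
        for resp in (vector_resp, keyword_resp):
            for rank, chunk in enumerate(resp.chunks, start=1):
                # Key assumed unique per chunk; real implementations key on their own row ids.
                cid = getattr(chunk, "chunk_id", None) or str(chunk.metadata)
                chunks_by_id[cid] = chunk
                # Reciprocal rank fusion: each list contributes 1 / (impact_factor + rank).
                scores[cid] = scores.get(cid, 0.0) + 1.0 / (impact_factor + rank)

        ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
        top = [(cid, s) for cid, s in ranked if s >= score_threshold][:k]
        return QueryChunksResponse(
            chunks=[chunks_by_id[cid] for cid, _ in top],
            scores=[s for _, s in top],
        )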
@@ -243,10 +261,29 @@ class VectorDBWithIndex:
         k = params.get("max_chunks", 3)
         mode = params.get("mode")
         score_threshold = params.get("score_threshold", 0.0)
+
+        # Get ranker configuration
+        ranker = params.get("ranker")
+        if ranker is None:
+            # Default to RRF with impact_factor=60.0
+            reranker_type = RERANKER_TYPE_RRF
+            reranker_params = {"impact_factor": 60.0}
+        else:
+            reranker_type = ranker.type
+            reranker_params = (
+                {"impact_factor": ranker.impact_factor} if ranker.type == RERANKER_TYPE_RRF else {"alpha": ranker.alpha}
+            )
+
         query_string = interleaved_content_as_str(query)
         if mode == "keyword":
             return await self.index.query_keyword(query_string, k, score_threshold)
-        else:
+
+        # Calculate embeddings for both vector and hybrid modes
         embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string])
         query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
+        if mode == "hybrid":
+            return await self.index.query_hybrid(
+                query_vector, query_string, k, score_threshold, reranker_type, reranker_params
+            )
+        else:
             return await self.index.query_vector(query_vector, k, score_threshold)
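The ranker read from params above is only required to expose .type plus either .impact_factor (RRF) or .alpha (weighted). A minimal sketch of what a caller might pass to query_chunks; the concrete ranker classes are not shown in this hunk, so plain attribute holders are used here as stand-ins:

from types import SimpleNamespace

# Hybrid query using reciprocal rank fusion (also the default when "ranker" is omitted).
rrf_ranker = SimpleNamespace(type="rrf", impact_factor=60.0)
params = {"mode": "hybrid", "max_chunks": 5, "score_threshold": 0.0, "ranker": rrf_ranker}

# Hybrid query blending normalized keyword and vector scores with a weighted average.
weighted_ranker = SimpleNamespace(type="weighted", alpha=0.7)
params = {"mode": "hybrid", "max_chunks": 5, "score_threshold": 0.0, "ranker": weighted_ranker}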
@@ -23,6 +23,8 @@ distribution_spec:
     - inline::basic
     - inline::llm-as-judge
     - inline::braintrust
+    files:
+    - inline::localfs
     post_training:
     - inline::huggingface
     tool_runtime:
@@ -13,6 +13,7 @@ from llama_stack.distribution.datatypes import (
     ShieldInput,
     ToolGroupInput,
 )
+from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
 from llama_stack.providers.inline.post_training.huggingface import HuggingFacePostTrainingConfig
 from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.remote.inference.ollama import OllamaImplConfig
 
@@ -29,6 +30,7 @@ def get_distribution_template() -> DistributionTemplate:
         "eval": ["inline::meta-reference"],
         "datasetio": ["remote::huggingface", "inline::localfs"],
         "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
+        "files": ["inline::localfs"],
         "post_training": ["inline::huggingface"],
         "tool_runtime": [
             "remote::brave-search",
 
@@ -49,6 +51,11 @@ def get_distribution_template() -> DistributionTemplate:
         provider_type="inline::faiss",
         config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
     )
+    files_provider = Provider(
+        provider_id="meta-reference-files",
+        provider_type="inline::localfs",
+        config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+    )
     posttraining_provider = Provider(
         provider_id="huggingface",
         provider_type="inline::huggingface",
 
@@ -98,6 +105,7 @@ def get_distribution_template() -> DistributionTemplate:
                 provider_overrides={
                     "inference": [inference_provider],
                     "vector_io": [vector_io_provider_faiss],
+                    "files": [files_provider],
                     "post_training": [posttraining_provider],
                 },
                 default_models=[inference_model, embedding_model],
 
@@ -107,6 +115,7 @@ def get_distribution_template() -> DistributionTemplate:
                 provider_overrides={
                     "inference": [inference_provider],
                     "vector_io": [vector_io_provider_faiss],
+                    "files": [files_provider],
                     "post_training": [posttraining_provider],
                     "safety": [
                         Provider(
@@ -4,6 +4,7 @@ apis:
 - agents
 - datasetio
 - eval
+- files
 - inference
 - post_training
 - safety
 
@@ -84,6 +85,14 @@ providers:
     provider_type: inline::braintrust
     config:
       openai_api_key: ${env.OPENAI_API_KEY:}
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/ollama/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/files_metadata.db
   post_training:
   - provider_id: huggingface
     provider_type: inline::huggingface
@@ -4,6 +4,7 @@ apis:
 - agents
 - datasetio
 - eval
+- files
 - inference
 - post_training
 - safety
 
@@ -82,6 +83,14 @@ providers:
     provider_type: inline::braintrust
     config:
       openai_api_key: ${env.OPENAI_API_KEY:}
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/ollama/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/files_metadata.db
   post_training:
   - provider_id: huggingface
     provider_type: inline::huggingface
@@ -17,6 +17,8 @@ distribution_spec:
     - inline::sqlite-vec
    - remote::chromadb
    - remote::pgvector
+    files:
+    - inline::localfs
     safety:
     - inline::llama-guard
     agents:
@@ -4,6 +4,7 @@ apis:
 - agents
 - datasetio
 - eval
+- files
 - inference
 - safety
 - scoring
 
@@ -75,6 +76,14 @@ providers:
       db: ${env.PGVECTOR_DB:}
       user: ${env.PGVECTOR_USER:}
       password: ${env.PGVECTOR_PASSWORD:}
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:~/.llama/distributions/starter/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/starter}/files_metadata.db
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
@@ -12,6 +12,7 @@ from llama_stack.distribution.datatypes import (
     ShieldInput,
     ToolGroupInput,
 )
+from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
 from llama_stack.providers.inline.inference.sentence_transformers import (
     SentenceTransformersInferenceConfig,
 )
 
@@ -134,6 +135,7 @@ def get_distribution_template() -> DistributionTemplate:
     providers = {
         "inference": ([p.provider_type for p in inference_providers] + ["inline::sentence-transformers"]),
         "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"],
+        "files": ["inline::localfs"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
         "telemetry": ["inline::meta-reference"],
 
@@ -170,6 +172,11 @@ def get_distribution_template() -> DistributionTemplate:
             ),
         ),
     ]
+    files_provider = Provider(
+        provider_id="meta-reference-files",
+        provider_type="inline::localfs",
+        config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+    )
     embedding_provider = Provider(
         provider_id="sentence-transformers",
         provider_type="inline::sentence-transformers",
 
@@ -212,6 +219,7 @@ def get_distribution_template() -> DistributionTemplate:
                 provider_overrides={
                     "inference": inference_providers + [embedding_provider],
                     "vector_io": vector_io_providers,
+                    "files": [files_provider],
                 },
                 default_models=default_models + [embedding_model],
                 default_tool_groups=default_tool_groups,
@@ -22,9 +22,6 @@ def provider_from_model(client_with_models, model_id):
 
 
 def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id):
-    if isinstance(client_with_models, LlamaStackAsLibraryClient):
-        pytest.skip("OpenAI completions are not supported when testing with library client yet.")
-
     provider = provider_from_model(client_with_models, model_id)
     if provider.provider_type in (
         "inline::meta-reference",
 
@@ -44,6 +41,23 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
         pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
 
 
+def skip_if_model_doesnt_support_suffix(client_with_models, model_id):
+    # To test `fim` ( fill in the middle ) completion, we need to use a model that supports suffix.
+    # Use this to specifically test this API functionality.
+
+    # pytest -sv --stack-config="inference=ollama" \
+    #   tests/integration/inference/test_openai_completion.py \
+    #   --text-model qwen2.5-coder:1.5b \
+    #   -k test_openai_completion_non_streaming_suffix
+
+    if model_id != "qwen2.5-coder:1.5b":
+        pytest.skip(f"Suffix is not supported for the model: {model_id}.")
+
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type != "remote::ollama":
+        pytest.skip(f"Provider {provider.provider_type} doesn't support suffix.")
+
+
 def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
     if isinstance(client_with_models, LlamaStackAsLibraryClient):
         pytest.skip("OpenAI chat completions are not supported when testing with library client yet.")
 
@@ -102,6 +116,32 @@ def test_openai_completion_non_streaming(llama_stack_client, client_with_models,
     assert len(choice.text) > 10
 
 
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:completion:suffix",
+    ],
+)
+def test_openai_completion_non_streaming_suffix(llama_stack_client, client_with_models, text_model_id, test_case):
+    skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
+    skip_if_model_doesnt_support_suffix(client_with_models, text_model_id)
+    tc = TestCase(test_case)
+
+    # ollama needs more verbose prompting for some reason here...
+    response = llama_stack_client.completions.create(
+        model=text_model_id,
+        prompt=tc["content"],
+        stream=False,
+        suffix=tc["suffix"],
+        max_tokens=10,
+    )
+
+    assert len(response.choices) > 0
+    choice = response.choices[0]
+    assert len(choice.text) > 5
+    assert "france" in choice.text.lower()
+
+
 @pytest.mark.parametrize(
     "test_case",
     [
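The new suffix test exercises fill-in-the-middle completions. A rough sketch of the same call outside pytest, assuming an Ollama-backed stack reachable locally and serving qwen2.5-coder:1.5b (base URL and API key are placeholders); the OpenAI client is used here since the stack exposes the /v1/openai/v1/completions route:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.completions.create(
    model="qwen2.5-coder:1.5b",
    prompt="The capital of ",
    suffix="is Paris.",  # fill-in-the-middle: the model generates the span between prompt and suffix
    max_tokens=10,
)
print(response.choices[0].text)  # expected to name France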
@@ -51,7 +51,6 @@ def skip_if_model_doesnt_support_openai_embeddings(client, model_id):
         "remote::runpod",
         "remote::sambanova",
         "remote::tgi",
-        "remote::ollama",
     ):
         pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI embeddings.")
 
@@ -4,6 +4,12 @@
       "content": "Complete the sentence using one word: Roses are red, violets are "
     }
   },
+  "suffix": {
+    "data": {
+      "content": "The capital of ",
+      "suffix": "is Paris."
+    }
+  },
   "non_streaming": {
     "data": {
       "content": "Micheael Jordan is born in ",
@@ -84,6 +84,28 @@ async def test_query_chunks_full_text_search(sqlite_vec_index, sample_chunks, sa
     assert len(response_no_results.chunks) == 0, f"Expected 0 results, but got {len(response_no_results.chunks)}"
 
 
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid(sqlite_vec_index, sample_chunks, sample_embeddings):
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # Create a query embedding that's similar to the first chunk
+    query_embedding = sample_embeddings[0]
+    query_string = "Sentence 5"
+
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=3,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+
+    assert len(response.chunks) == 3, f"Expected 3 results, got {len(response.chunks)}"
+    # Verify scores are in descending order (higher is better)
+    assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
+
+
 @pytest.mark.asyncio
 async def test_query_chunks_full_text_search_k_greater_than_results(sqlite_vec_index, sample_chunks, sample_embeddings):
     # Re-initialize with a clean index
 
@@ -141,3 +163,355 @@ def test_generate_chunk_id():
         "bc744db3-1b25-0a9c-cdff-b6ba3df73c36",
         "f68df25d-d9aa-ab4d-5684-64a233add20d",
     ]
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_no_keyword_matches(sqlite_vec_index, sample_chunks, sample_embeddings):
+    """Test hybrid search when keyword search returns no matches - should still return vector results."""
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # Use a non-existent keyword but a valid vector query
+    query_embedding = sample_embeddings[0]
+    query_string = "Sentence 499"
+
+    # First verify keyword search returns no results
+    keyword_response = await sqlite_vec_index.query_keyword(query_string, k=5, score_threshold=0.0)
+    assert len(keyword_response.chunks) == 0, "Keyword search should return no results"
+
+    # Get hybrid results
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=3,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+
+    # Should still get results from vector search
+    assert len(response.chunks) > 0, "Should get results from vector search even with no keyword matches"
+    # Verify scores are in descending order
+    assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_score_threshold(sqlite_vec_index, sample_chunks, sample_embeddings):
+    """Test hybrid search with a high score threshold."""
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # Use a very high score threshold that no results will meet
+    query_embedding = sample_embeddings[0]
+    query_string = "Sentence 5"
+
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=3,
+        score_threshold=1000.0,  # Very high threshold
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+
+    # Should return no results due to high threshold
+    assert len(response.chunks) == 0
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_different_embedding(
+    sqlite_vec_index, sample_chunks, sample_embeddings, embedding_dimension
+):
+    """Test hybrid search with a different embedding than the stored ones."""
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # Create a random embedding that's different from stored ones
+    query_embedding = np.random.rand(embedding_dimension).astype(np.float32)
+    query_string = "Sentence 5"
+
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=3,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+
+    # Should still get results if keyword matches exist
+    assert len(response.chunks) > 0
+    # Verify scores are in descending order
+    assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_rrf_ranking(sqlite_vec_index, sample_chunks, sample_embeddings):
+    """Test that RRF properly combines rankings when documents appear in both search methods."""
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # Create a query embedding that's similar to the first chunk
+    query_embedding = sample_embeddings[0]
+    # Use a keyword that appears in multiple documents
+    query_string = "Sentence 5"
+
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=5,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+
+    # Verify we get results from both search methods
+    assert len(response.chunks) > 0
+    # Verify scores are in descending order (RRF should maintain this)
+    assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_score_selection(sqlite_vec_index, sample_chunks, sample_embeddings):
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # Create a query embedding that's similar to the first chunk
+    query_embedding = sample_embeddings[0]
+    # Use a keyword that appears in the first document
+    query_string = "Sentence 0 from document 0"
+
+    # Test weighted re-ranking
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="weighted",
+        reranker_params={"alpha": 0.5},
+    )
+    assert len(response.chunks) == 1
+    # Score should be weighted average of normalized keyword score and vector score
+    assert response.scores[0] > 0.5  # Both scores should be high
+
+    # Test RRF re-ranking
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+    assert len(response.chunks) == 1
+    # RRF score should be sum of reciprocal ranks
+    assert response.scores[0] == pytest.approx(2.0 / 61.0, rel=1e-6)  # 1/(60+1) + 1/(60+1)
+
+    # Test default re-ranking (should be RRF)
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+    assert len(response.chunks) == 1
+    assert response.scores[0] == pytest.approx(2.0 / 61.0, rel=1e-6)  # Should behave like RRF
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_mixed_results(sqlite_vec_index, sample_chunks, sample_embeddings):
+    """Test hybrid search with documents that appear in only one search method."""
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # Create a query embedding that's similar to the first chunk
+    query_embedding = sample_embeddings[0]
+    # Use a keyword that appears in a different document
+    query_string = "Sentence 9 from document 2"
+
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=3,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+
+    # Should get results from both search methods
+    assert len(response.chunks) > 0
+    # Verify scores are in descending order
+    assert all(response.scores[i] >= response.scores[i + 1] for i in range(len(response.scores) - 1))
+    # Verify we get results from both the vector-similar document and keyword-matched document
+    doc_ids = {chunk.metadata["document_id"] for chunk in response.chunks}
+    assert "document-0" in doc_ids  # From vector search
+    assert "document-2" in doc_ids  # From keyword search
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_weighted_reranker_parametrization(
+    sqlite_vec_index, sample_chunks, sample_embeddings
+):
+    """Test WeightedReRanker with different alpha values."""
+    # Re-add data before each search to ensure test isolation
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+    query_embedding = sample_embeddings[0]
+    query_string = "Sentence 0 from document 0"
+
+    # alpha=1.0 (should behave like pure keyword)
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="weighted",
+        reranker_params={"alpha": 1.0},
+    )
+    assert len(response.chunks) > 0  # Should get at least one result
+    assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks)
+
+    # alpha=0.0 (should behave like pure vector)
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="weighted",
+        reranker_params={"alpha": 0.0},
+    )
+    assert len(response.chunks) > 0  # Should get at least one result
+    assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks)
+
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+    # alpha=0.7 (should be a mix)
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="weighted",
+        reranker_params={"alpha": 0.7},
+    )
+    assert len(response.chunks) > 0  # Should get at least one result
+    assert any("document-0" in chunk.metadata["document_id"] for chunk in response.chunks)
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_rrf_impact_factor(sqlite_vec_index, sample_chunks, sample_embeddings):
+    """Test RRFReRanker with different impact factors."""
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+    query_embedding = sample_embeddings[0]
+    query_string = "Sentence 0 from document 0"
+
+    # impact_factor=10
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 10.0},
+    )
+    assert len(response.chunks) == 1
+    assert response.scores[0] == pytest.approx(2.0 / 11.0, rel=1e-6)
+
+    # impact_factor=100
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=1,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 100.0},
+    )
+    assert len(response.chunks) == 1
+    assert response.scores[0] == pytest.approx(2.0 / 101.0, rel=1e-6)
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_edge_cases(sqlite_vec_index, sample_chunks, sample_embeddings):
+    await sqlite_vec_index.add_chunks(sample_chunks, sample_embeddings)
+
+    # No results from either search - use a completely different embedding and a nonzero threshold
+    query_embedding = np.ones_like(sample_embeddings[0]) * -1  # Very different from sample embeddings
+    query_string = "no_such_keyword_that_will_never_match"
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=3,
+        score_threshold=0.1,  # Nonzero threshold to filter out low-similarity matches
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+    assert len(response.chunks) == 0
+
+    # All results below threshold
+    query_embedding = sample_embeddings[0]
+    query_string = "Sentence 0 from document 0"
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=3,
+        score_threshold=1000.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+    assert len(response.chunks) == 0
+
+    # Large k value
+    response = await sqlite_vec_index.query_hybrid(
+        embedding=query_embedding,
+        query_string=query_string,
+        k=100,
+        score_threshold=0.0,
+        reranker_type="rrf",
+        reranker_params={"impact_factor": 60.0},
+    )
+    # Should not error, should return all available results
+    assert len(response.chunks) > 0
+    assert len(response.chunks) <= 100
+
+
+@pytest.mark.asyncio
+async def test_query_chunks_hybrid_tie_breaking(
+    sqlite_vec_index, sample_embeddings, embedding_dimension, tmp_path_factory
+):
+    """Test tie-breaking and determinism when scores are equal."""
+    # Create two chunks with the same content and embedding
+    chunk1 = Chunk(content="identical", metadata={"document_id": "docA"})
+    chunk2 = Chunk(content="identical", metadata={"document_id": "docB"})
+    chunks = [chunk1, chunk2]
+    # Use the same embedding for both chunks to ensure equal scores
+    same_embedding = sample_embeddings[0]
+    embeddings = np.array([same_embedding, same_embedding])
+
+    # Clear existing data and recreate index
+    await sqlite_vec_index.delete()
+    temp_dir = tmp_path_factory.getbasetemp()
+    db_path = str(temp_dir / "test_sqlite.db")
+    sqlite_vec_index = await SQLiteVecIndex.create(dimension=embedding_dimension, db_path=db_path, bank_id="test_bank")
+    await sqlite_vec_index.add_chunks(chunks, embeddings)
+
+    # Query with the same embedding and content to ensure equal scores
+    query_embedding = same_embedding
+    query_string = "identical"
+
+    # Run multiple queries to verify determinism
+    responses = []
+    for _ in range(3):
+        response = await sqlite_vec_index.query_hybrid(
+            embedding=query_embedding,
+            query_string=query_string,
+            k=2,
+            score_threshold=0.0,
+            reranker_type="rrf",
+            reranker_params={"impact_factor": 60.0},
+        )
+        responses.append(response)
+
+    # Verify all responses are identical
+    first_response = responses[0]
+    for response in responses[1:]:
+        assert response.chunks == first_response.chunks
+        assert response.scores == first_response.scores
+
+    # Verify both chunks are returned with equal scores
+    assert len(first_response.chunks) == 2
+    assert first_response.scores[0] == first_response.scores[1]
+    assert {chunk.metadata["document_id"] for chunk in first_response.chunks} == {"docA", "docB"}
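The exact RRF scores asserted above follow directly from the reciprocal-rank-fusion formula: a chunk ranked first by both the keyword and the vector search receives 1/(impact_factor + 1) from each list, giving 2/61 for the default factor of 60, 2/11 for 10, and 2/101 for 100. A small standalone check of that arithmetic:

def rrf_score(ranks, impact_factor=60.0):
    # One reciprocal-rank term per result list the chunk appears in.
    return sum(1.0 / (impact_factor + rank) for rank in ranks)

assert abs(rrf_score([1, 1], 60.0) - 2.0 / 61.0) < 1e-9
assert abs(rrf_score([1, 1], 10.0) - 2.0 / 11.0) < 1e-9
assert abs(rrf_score([1, 1], 100.0) - 2.0 / 101.0) < 1e-9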
Binary file not shown.

@@ -31,6 +31,25 @@ test_response_web_search:
           search_context_size: "low"
     output: "128"
 
+test_response_file_search:
+  test_name: test_response_file_search
+  test_params:
+    case:
+      - case_id: "llama_experts"
+        input: "How many experts does the Llama 4 Maverick model have?"
+        tools:
+          - type: file_search
+            # vector_store_ids param for file_search tool gets added by the test runner
+        file_content: "Llama 4 Maverick has 128 experts"
+        output: "128"
+      - case_id: "llama_experts_pdf"
+        input: "How many experts does the Llama 4 Maverick model have?"
+        tools:
+          - type: file_search
+            # vector_store_ids param for file_search tool gets added by the test runner
+        file_path: "pdfs/llama_stack_and_models.pdf"
+        output: "128"
+
 test_response_mcp_tool:
   test_name: test_response_mcp_tool
   test_params:
@@ -5,6 +5,8 @@
 # the root directory of this source tree.
 
 import json
+import os
+import time
 
 import httpx
 import openai
 
@@ -23,6 +25,31 @@ from tests.verifications.openai_api.fixtures.load import load_test_cases
 responses_test_cases = load_test_cases("responses")
 
 
+def _new_vector_store(openai_client, name):
+    # Ensure we don't reuse an existing vector store
+    vector_stores = openai_client.vector_stores.list()
+    for vector_store in vector_stores:
+        if vector_store.name == name:
+            openai_client.vector_stores.delete(vector_store_id=vector_store.id)
+
+    # Create a new vector store
+    vector_store = openai_client.vector_stores.create(
+        name=name,
+    )
+    return vector_store
+
+
+def _upload_file(openai_client, name, file_path):
+    # Ensure we don't reuse an existing file
+    files = openai_client.files.list()
+    for file in files:
+        if file.filename == name:
+            openai_client.files.delete(file_id=file.id)
+
+    # Upload a text file with our document content
+    return openai_client.files.create(file=open(file_path, "rb"), purpose="assistants")
+
+
 @pytest.mark.parametrize(
     "case",
     responses_test_cases["test_response_basic"]["test_params"]["case"],
 
@@ -258,6 +285,111 @@ def test_response_non_streaming_web_search(request, openai_client, model, provid
     assert case["output"].lower() in response.output_text.lower().strip()
 
 
+@pytest.mark.parametrize(
+    "case",
+    responses_test_cases["test_response_file_search"]["test_params"]["case"],
+    ids=case_id_generator,
+)
+def test_response_non_streaming_file_search(
+    request, openai_client, model, provider, verification_config, tmp_path, case
+):
+    if isinstance(openai_client, LlamaStackAsLibraryClient):
+        pytest.skip("Responses API file search is not yet supported in library client.")
+
+    test_name_base = get_base_test_name(request)
+    if should_skip_test(verification_config, provider, model, test_name_base):
+        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
+
+    vector_store = _new_vector_store(openai_client, "test_vector_store")
+
+    if "file_content" in case:
+        file_name = "test_response_non_streaming_file_search.txt"
+        file_path = tmp_path / file_name
+        file_path.write_text(case["file_content"])
+    elif "file_path" in case:
+        file_path = os.path.join(os.path.dirname(__file__), "fixtures", case["file_path"])
+        file_name = os.path.basename(file_path)
+    else:
+        raise ValueError(f"No file content or path provided for case {case['case_id']}")
+
+    file_response = _upload_file(openai_client, file_name, file_path)
+
+    # Attach our file to the vector store
+    file_attach_response = openai_client.vector_stores.files.create(
+        vector_store_id=vector_store.id,
+        file_id=file_response.id,
+    )
+
+    # Wait for the file to be attached
+    while file_attach_response.status == "in_progress":
+        time.sleep(0.1)
+        file_attach_response = openai_client.vector_stores.files.retrieve(
+            vector_store_id=vector_store.id,
+            file_id=file_response.id,
+        )
+    assert file_attach_response.status == "completed", f"Expected file to be attached, got {file_attach_response}"
+    assert not file_attach_response.last_error
+
+    # Update our tools with the right vector store id
+    tools = case["tools"]
+    for tool in tools:
+        if tool["type"] == "file_search":
+            tool["vector_store_ids"] = [vector_store.id]
+
+    # Create the response request, which should query our vector store
+    response = openai_client.responses.create(
+        model=model,
+        input=case["input"],
+        tools=tools,
+        stream=False,
+        include=["file_search_call.results"],
+    )
+
+    # Verify the file_search_tool was called
+    assert len(response.output) > 1
+    assert response.output[0].type == "file_search_call"
+    assert response.output[0].status == "completed"
+    assert response.output[0].queries  # ensure it's some non-empty list
+    assert response.output[0].results
+    assert case["output"].lower() in response.output[0].results[0].text.lower()
+    assert response.output[0].results[0].score > 0
+
+    # Verify the output_text generated by the response
+    assert case["output"].lower() in response.output_text.lower().strip()
+
+
+def test_response_non_streaming_file_search_empty_vector_store(
+    request, openai_client, model, provider, verification_config
+):
+    if isinstance(openai_client, LlamaStackAsLibraryClient):
+        pytest.skip("Responses API file search is not yet supported in library client.")
+
+    test_name_base = get_base_test_name(request)
+    if should_skip_test(verification_config, provider, model, test_name_base):
+        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
+
+    vector_store = _new_vector_store(openai_client, "test_vector_store")
+
+    # Create the response request, which should query our vector store
+    response = openai_client.responses.create(
+        model=model,
+        input="How many experts does the Llama 4 Maverick model have?",
+        tools=[{"type": "file_search", "vector_store_ids": [vector_store.id]}],
+        stream=False,
+        include=["file_search_call.results"],
+    )
+
+    # Verify the file_search_tool was called
+    assert len(response.output) > 1
+    assert response.output[0].type == "file_search_call"
+    assert response.output[0].status == "completed"
+    assert response.output[0].queries  # ensure it's some non-empty list
+    assert not response.output[0].results  # ensure we don't get any results
+
+    # Verify some output_text was generated by the response
+    assert response.output_text
+
+
 @pytest.mark.parametrize(
     "case",
     responses_test_cases["test_response_mcp_tool"]["test_params"]["case"],
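Stripped of the test scaffolding, the file_search wiring these tests rely on is just a tool entry on the Responses request. A minimal sketch, with the model id and vector store as placeholders (max_num_results and filters are optional fields, shown here only to illustrate the tool's shape):

response = openai_client.responses.create(
    model="<your-model-id>",  # placeholder
    input="How many experts does the Llama 4 Maverick model have?",
    tools=[
        {
            "type": "file_search",
            "vector_store_ids": [vector_store.id],  # stores to search
            "max_num_results": 10,                  # optional cap on retrieved chunks
            # "filters": {...},                     # optional attribute filters
        }
    ],
    include=["file_search_call.results"],  # also return the retrieved snippets
    stream=False,
)
print(response.output_text)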