Merge branch 'main' into suffic

This commit is contained in:
Hardik Shah 2025-06-13 16:03:50 -07:00 committed by GitHub
commit 2edb9eb7e0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
37 changed files with 2105 additions and 63 deletions

View file

@ -4,8 +4,6 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import struct
from collections.abc import AsyncGenerator, AsyncIterator
from typing import Any
@ -37,7 +35,6 @@ from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAICompletion,
OpenAIEmbeddingData,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
OpenAIMessageParam,
@ -48,6 +45,7 @@ from llama_stack.distribution.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import (
b64_encode_openai_embeddings_response,
convert_message_to_openai_dict_new,
convert_openai_chat_completion_choice,
convert_openai_chat_completion_stream,
@ -293,16 +291,7 @@ class LiteLLMOpenAIMixin(
)
# Convert response to OpenAI format
data = []
for i, embedding_data in enumerate(response["data"]):
# we encode to base64 if the encoding format is base64 in the request
if encoding_format == "base64":
byte_data = b"".join(struct.pack("f", f) for f in embedding_data["embedding"])
embedding = base64.b64encode(byte_data).decode("utf-8")
else:
embedding = embedding_data["embedding"]
data.append(OpenAIEmbeddingData(embedding=embedding, index=i))
data = b64_encode_openai_embeddings_response(response.data, encoding_format)
usage = OpenAIEmbeddingUsage(
prompt_tokens=response["usage"]["prompt_tokens"],

View file

@ -3,8 +3,10 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import json
import logging
import struct
import time
import uuid
import warnings
@ -108,6 +110,7 @@ from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
OpenAICompletion,
OpenAICompletionChoice,
OpenAIEmbeddingData,
OpenAIMessageParam,
OpenAIResponseFormatParam,
ToolConfig,
@ -1484,3 +1487,55 @@ class OpenAIChatCompletionToLlamaStackMixin:
model=model,
object="chat.completion",
)
def prepare_openai_embeddings_params(
model: str,
input: str | list[str],
encoding_format: str | None = "float",
dimensions: int | None = None,
user: str | None = None,
):
if model is None:
raise ValueError("Model must be provided for embeddings")
input_list = [input] if isinstance(input, str) else input
params: dict[str, Any] = {
"model": model,
"input": input_list,
}
if encoding_format is not None:
params["encoding_format"] = encoding_format
if dimensions is not None:
params["dimensions"] = dimensions
if user is not None:
params["user"] = user
return params
def b64_encode_openai_embeddings_response(
response_data: dict, encoding_format: str | None = "float"
) -> list[OpenAIEmbeddingData]:
"""
Process the OpenAI embeddings response to encode the embeddings in base64 format if specified.
"""
data = []
for i, embedding_data in enumerate(response_data):
if encoding_format == "base64":
byte_array = bytearray()
for embedding_value in embedding_data.embedding:
byte_array.extend(struct.pack("f", float(embedding_value)))
response_embedding = base64.b64encode(byte_array).decode("utf-8")
else:
response_embedding = embedding_data.embedding
data.append(
OpenAIEmbeddingData(
embedding=response_embedding,
index=i,
)
)
return data

View file

@ -5,11 +5,13 @@
# the root directory of this source tree.
import logging
import mimetypes
import time
import uuid
from abc import ABC, abstractmethod
from typing import Any
from llama_stack.apis.files import Files
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import (
QueryChunksResponse,
@ -20,6 +22,15 @@ from llama_stack.apis.vector_io import (
VectorStoreSearchResponse,
VectorStoreSearchResponsePage,
)
from llama_stack.apis.vector_io.vector_io import (
Chunk,
VectorStoreChunkingStrategy,
VectorStoreChunkingStrategyAuto,
VectorStoreChunkingStrategyStatic,
VectorStoreFileLastError,
VectorStoreFileObject,
)
from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, make_overlapped_chunks
logger = logging.getLogger(__name__)
@ -36,6 +47,7 @@ class OpenAIVectorStoreMixin(ABC):
# These should be provided by the implementing class
openai_vector_stores: dict[str, dict[str, Any]]
files_api: Files | None
@abstractmethod
async def _save_openai_vector_store(self, store_id: str, store_info: dict[str, Any]) -> None:
@ -67,6 +79,16 @@ class OpenAIVectorStoreMixin(ABC):
"""Unregister a vector database (provider-specific implementation)."""
pass
@abstractmethod
async def insert_chunks(
self,
vector_db_id: str,
chunks: list[Chunk],
ttl_seconds: int | None = None,
) -> None:
"""Insert chunks into a vector database (provider-specific implementation)."""
pass
@abstractmethod
async def query_chunks(
self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
@ -383,3 +405,78 @@ class OpenAIVectorStoreMixin(ABC):
if metadata[key] != value:
return False
return True
async def openai_attach_file_to_vector_store(
self,
vector_store_id: str,
file_id: str,
attributes: dict[str, Any] | None = None,
chunking_strategy: VectorStoreChunkingStrategy | None = None,
) -> VectorStoreFileObject:
attributes = attributes or {}
chunking_strategy = chunking_strategy or VectorStoreChunkingStrategyAuto()
vector_store_file_object = VectorStoreFileObject(
id=file_id,
attributes=attributes,
chunking_strategy=chunking_strategy,
created_at=int(time.time()),
status="in_progress",
vector_store_id=vector_store_id,
)
if not hasattr(self, "files_api") or not self.files_api:
vector_store_file_object.status = "failed"
vector_store_file_object.last_error = VectorStoreFileLastError(
code="server_error",
message="Files API is not available",
)
return vector_store_file_object
if isinstance(chunking_strategy, VectorStoreChunkingStrategyStatic):
max_chunk_size_tokens = chunking_strategy.static.max_chunk_size_tokens
chunk_overlap_tokens = chunking_strategy.static.chunk_overlap_tokens
else:
# Default values from OpenAI API spec
max_chunk_size_tokens = 800
chunk_overlap_tokens = 400
try:
file_response = await self.files_api.openai_retrieve_file(file_id)
mime_type, _ = mimetypes.guess_type(file_response.filename)
content_response = await self.files_api.openai_retrieve_file_content(file_id)
content = content_from_data_and_mime_type(content_response.body, mime_type)
chunks = make_overlapped_chunks(
file_id,
content,
max_chunk_size_tokens,
chunk_overlap_tokens,
attributes,
)
if not chunks:
vector_store_file_object.status = "failed"
vector_store_file_object.last_error = VectorStoreFileLastError(
code="server_error",
message="No chunks were generated from the file",
)
return vector_store_file_object
await self.insert_chunks(
vector_db_id=vector_store_id,
chunks=chunks,
)
except Exception as e:
logger.error(f"Error attaching file to vector store: {e}")
vector_store_file_object.status = "failed"
vector_store_file_object.last_error = VectorStoreFileLastError(
code="server_error",
message=str(e),
)
return vector_store_file_object
vector_store_file_object.status = "completed"
return vector_store_file_object

View file

@ -32,6 +32,10 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
log = logging.getLogger(__name__)
# Constants for reranker types
RERANKER_TYPE_RRF = "rrf"
RERANKER_TYPE_WEIGHTED = "weighted"
def parse_pdf(data: bytes) -> str:
# For PDF and DOC/DOCX files, we can't reliably convert to string
@ -72,16 +76,18 @@ def content_from_data(data_url: str) -> str:
data = unquote(data)
encoding = parts["encoding"] or "utf-8"
data = data.encode(encoding)
return content_from_data_and_mime_type(data, parts["mimetype"], parts.get("encoding", None))
encoding = parts["encoding"]
if not encoding:
import chardet
detected = chardet.detect(data)
encoding = detected["encoding"]
def content_from_data_and_mime_type(data: bytes | str, mime_type: str | None, encoding: str | None = None) -> str:
if isinstance(data, bytes):
if not encoding:
import chardet
mime_type = parts["mimetype"]
mime_category = mime_type.split("/")[0]
detected = chardet.detect(data)
encoding = detected["encoding"]
mime_category = mime_type.split("/")[0] if mime_type else None
if mime_category == "text":
# For text-based files (including CSV, MD)
return data.decode(encoding)
@ -200,6 +206,18 @@ class EmbeddingIndex(ABC):
async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
raise NotImplementedError()
@abstractmethod
async def query_hybrid(
self,
embedding: NDArray,
query_string: str,
k: int,
score_threshold: float,
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
raise NotImplementedError()
@abstractmethod
async def delete(self):
raise NotImplementedError()
@ -243,10 +261,29 @@ class VectorDBWithIndex:
k = params.get("max_chunks", 3)
mode = params.get("mode")
score_threshold = params.get("score_threshold", 0.0)
# Get ranker configuration
ranker = params.get("ranker")
if ranker is None:
# Default to RRF with impact_factor=60.0
reranker_type = RERANKER_TYPE_RRF
reranker_params = {"impact_factor": 60.0}
else:
reranker_type = ranker.type
reranker_params = (
{"impact_factor": ranker.impact_factor} if ranker.type == RERANKER_TYPE_RRF else {"alpha": ranker.alpha}
)
query_string = interleaved_content_as_str(query)
if mode == "keyword":
return await self.index.query_keyword(query_string, k, score_threshold)
# Calculate embeddings for both vector and hybrid modes
embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string])
query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
if mode == "hybrid":
return await self.index.query_hybrid(
query_vector, query_string, k, score_threshold, reranker_type, reranker_params
)
else:
embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string])
query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
return await self.index.query_vector(query_vector, k, score_threshold)