feat: Updating files/content response to return additional fields

Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
Francisco Javier Arceo 2025-08-06 16:55:14 -04:00
parent e12524af85
commit a19c16428f
143 changed files with 6907 additions and 15104 deletions


@@ -9,12 +9,12 @@ import base64
import io
from urllib.parse import unquote

import pandas

from llama_stack.providers.utils.memory.vector_store import parse_data_url


async def get_dataframe_from_uri(uri: str):
    import pandas

    df = None
    if uri.endswith(".csv"):
        # Moving to its own thread to avoid io from blocking the eventloop
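Note: the comment above refers to running the blocking pandas read in a worker thread so it does not stall the asyncio event loop. A minimal sketch of that pattern follows; the helper name and path argument are illustrative and not part of this diff:

import asyncio

import pandas


async def read_csv_without_blocking(path: str):
    # pandas.read_csv does blocking file IO; run it in a thread so the event loop stays responsive
    return await asyncio.to_thread(pandas.read_csv, path)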


@@ -1,129 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from collections.abc import AsyncIterator
from datetime import UTC, datetime
from typing import Any

from llama_stack.apis.inference import (
    OpenAIAssistantMessageParam,
    OpenAIChatCompletion,
    OpenAIChatCompletionChunk,
    OpenAIChatCompletionToolCall,
    OpenAIChatCompletionToolCallFunction,
    OpenAIChoice,
    OpenAIChoiceLogprobs,
    OpenAIMessageParam,
)
from llama_stack.providers.utils.inference.inference_store import InferenceStore


async def stream_and_store_openai_completion(
    provider_stream: AsyncIterator[OpenAIChatCompletionChunk],
    model: str,
    store: InferenceStore,
    input_messages: list[OpenAIMessageParam],
) -> AsyncIterator[OpenAIChatCompletionChunk]:
    """
    Wraps a provider's stream, yields chunks, and stores the full completion at the end.
    """
    id = None
    created = None
    choices_data: dict[int, dict[str, Any]] = {}

    try:
        async for chunk in provider_stream:
            if id is None and chunk.id:
                id = chunk.id
            if created is None and chunk.created:
                created = chunk.created
            if chunk.choices:
                for choice_delta in chunk.choices:
                    idx = choice_delta.index
                    if idx not in choices_data:
                        choices_data[idx] = {
                            "content_parts": [],
                            "tool_calls_builder": {},
                            "finish_reason": None,
                            "logprobs_content_parts": [],
                        }
                    current_choice_data = choices_data[idx]
                    if choice_delta.delta:
                        delta = choice_delta.delta
                        if delta.content:
                            current_choice_data["content_parts"].append(delta.content)
                        if delta.tool_calls:
                            for tool_call_delta in delta.tool_calls:
                                tc_idx = tool_call_delta.index
                                if tc_idx not in current_choice_data["tool_calls_builder"]:
                                    # Initialize with correct structure for _ToolCallBuilderData
                                    current_choice_data["tool_calls_builder"][tc_idx] = {
                                        "id": None,
                                        "type": "function",
                                        "function_name_parts": [],
                                        "function_arguments_parts": [],
                                    }
                                builder = current_choice_data["tool_calls_builder"][tc_idx]
                                if tool_call_delta.id:
                                    builder["id"] = tool_call_delta.id
                                if tool_call_delta.type:
                                    builder["type"] = tool_call_delta.type
                                if tool_call_delta.function:
                                    if tool_call_delta.function.name:
                                        builder["function_name_parts"].append(tool_call_delta.function.name)
                                    if tool_call_delta.function.arguments:
                                        builder["function_arguments_parts"].append(tool_call_delta.function.arguments)
                    if choice_delta.finish_reason:
                        current_choice_data["finish_reason"] = choice_delta.finish_reason
                    if choice_delta.logprobs and choice_delta.logprobs.content:
                        # Ensure that we are extending with the correct type
                        current_choice_data["logprobs_content_parts"].extend(choice_delta.logprobs.content)
            yield chunk
    finally:
        if id:
            assembled_choices: list[OpenAIChoice] = []
            for choice_idx, choice_data in choices_data.items():
                content_str = "".join(choice_data["content_parts"])
                assembled_tool_calls: list[OpenAIChatCompletionToolCall] = []
                if choice_data["tool_calls_builder"]:
                    for tc_build_data in choice_data["tool_calls_builder"].values():
                        if tc_build_data["id"]:
                            func_name = "".join(tc_build_data["function_name_parts"])
                            func_args = "".join(tc_build_data["function_arguments_parts"])
                            assembled_tool_calls.append(
                                OpenAIChatCompletionToolCall(
                                    id=tc_build_data["id"],
                                    type=tc_build_data["type"],  # No or "function" needed, already set
                                    function=OpenAIChatCompletionToolCallFunction(name=func_name, arguments=func_args),
                                )
                            )
                message = OpenAIAssistantMessageParam(
                    role="assistant",
                    content=content_str if content_str else None,
                    tool_calls=assembled_tool_calls if assembled_tool_calls else None,
                )
                logprobs_content = choice_data["logprobs_content_parts"]
                final_logprobs = OpenAIChoiceLogprobs(content=logprobs_content) if logprobs_content else None
                assembled_choices.append(
                    OpenAIChoice(
                        finish_reason=choice_data["finish_reason"],
                        index=choice_idx,
                        message=message,
                        logprobs=final_logprobs,
                    )
                )
            final_response = OpenAIChatCompletion(
                id=id,
                choices=assembled_choices,
                created=created or int(datetime.now(UTC).timestamp()),
                model=model,
                object="chat.completion",
            )
            await store.store_chat_completion(final_response, input_messages)


@@ -18,6 +18,7 @@ from llama_stack.apis.files import Files, OpenAIFileObject
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import (
    Chunk,
    ChunkMetadata,
    QueryChunksResponse,
    SearchRankingOptions,
    VectorStoreChunkingStrategy,
@@ -516,31 +517,68 @@ class OpenAIVectorStoreMixin(ABC):
            raise ValueError(f"Unsupported filter type: {filter_type}")

    def _chunk_to_vector_store_content(self, chunk: Chunk) -> list[VectorStoreContent]:
        created_ts = None
        if chunk.chunk_metadata is not None:
            created_ts = getattr(chunk.chunk_metadata, "created_timestamp", None)

        metadata_dict = {}
        if chunk.chunk_metadata:
            if hasattr(chunk.chunk_metadata, "model_dump"):
                metadata_dict = chunk.chunk_metadata.model_dump()
            else:
                metadata_dict = vars(chunk.chunk_metadata)

        user_metadata = chunk.metadata or {}
        base_meta = {**metadata_dict, **user_metadata}

        # content is InterleavedContent
        if isinstance(chunk.content, str):
            content = [
                VectorStoreContent(
                    type="text",
                    text=chunk.content,
                    embedding=chunk.embedding,
                    created_timestamp=created_ts,
                    metadata=user_metadata,
                    chunk_metadata=ChunkMetadata(**base_meta) if base_meta else None,
                )
            ]
        elif isinstance(chunk.content, list):
            # TODO: Add support for other types of content
            content = [
                VectorStoreContent(
                    type="text",
                    text=item.text,
                )
                for item in chunk.content
                if item.type == "text"
            ]
            content = []
            for item in chunk.content:
                if hasattr(item, "type") and item.type == "text":
                    item_meta = {**base_meta}
                    item_user_meta = getattr(item, "metadata", {}) or {}
                    if item_user_meta:
                        item_meta.update(item_user_meta)
                    content.append(
                        VectorStoreContent(
                            type="text",
                            text=item.text,
                            embedding=getattr(item, "embedding", None),
                            created_timestamp=created_ts,
                            metadata=item_user_meta,
                            chunk_metadata=ChunkMetadata(**item_meta) if item_meta else None,
                        )
                    )
        else:
            if chunk.content.type != "text":
                raise ValueError(f"Unsupported content type: {chunk.content.type}")
            content_item = chunk.content
            if content_item.type != "text":
                raise ValueError(f"Unsupported content type: {content_item.type}")

            item_user_meta = getattr(content_item, "metadata", {}) or {}
            combined_meta = {**base_meta, **item_user_meta}

            content = [
                VectorStoreContent(
                    type="text",
                    text=chunk.content.text,
                    text=content_item.text,
                    embedding=getattr(content_item, "embedding", None),
                    created_timestamp=created_ts,
                    metadata=item_user_meta,
                    chunk_metadata=ChunkMetadata(**combined_meta) if combined_meta else None,
                )
            ]
        return content
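Note: the reworked _chunk_to_vector_store_content above is what makes the files/content response return the additional fields named in the commit title. A rough sketch of the shape of one returned item (field names come from the hunk above; the values are made up for illustration):

# Hypothetical example of one entry returned by _chunk_to_vector_store_content
example_item = {
    "type": "text",
    "text": "Deep learning models ...",
    "embedding": [0.12, -0.03, 0.88],           # per-chunk embedding, if present
    "created_timestamp": 1722988514,             # pulled from chunk.chunk_metadata
    "metadata": {"source": "user-upload.pdf"},   # user-supplied chunk.metadata
    "chunk_metadata": {                          # ChunkMetadata merged with user metadata
        "created_timestamp": 1722988514,
        "source": "user-upload.pdf",
    },
}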


@@ -302,23 +302,25 @@ class VectorDBWithIndex:
        mode = params.get("mode")
        score_threshold = params.get("score_threshold", 0.0)

        # Get ranker configuration
        ranker = params.get("ranker")
        if ranker is None:
            # Default to RRF with impact_factor=60.0
            reranker_type = RERANKER_TYPE_RRF
            reranker_params = {"impact_factor": 60.0}
        else:
            reranker_type = ranker.type
            reranker_params = (
                {"impact_factor": ranker.impact_factor} if ranker.type == RERANKER_TYPE_RRF else {"alpha": ranker.alpha}
            )
            strategy = ranker.get("strategy", "rrf")
            if strategy == "weighted":
                weights = ranker.get("params", {}).get("weights", [0.5, 0.5])
                reranker_type = RERANKER_TYPE_WEIGHTED
                reranker_params = {"alpha": weights[0] if len(weights) > 0 else 0.5}
            else:
                reranker_type = RERANKER_TYPE_RRF
                k_value = ranker.get("params", {}).get("k", 60.0)
                reranker_params = {"impact_factor": k_value}

        query_string = interleaved_content_as_str(query)
        if mode == "keyword":
            return await self.index.query_keyword(query_string, k, score_threshold)

        # Calculate embeddings for both vector and hybrid modes
        embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string])
        query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
        if mode == "hybrid":


@@ -9,7 +9,9 @@ import contextvars
import logging
import queue
import random
import sys
import threading
import time
from collections.abc import Callable
from datetime import UTC, datetime
from functools import wraps
@@ -30,6 +32,16 @@ from llama_stack.providers.utils.telemetry.trace_protocol import serialize_value
logger = get_logger(__name__, category="core")

# Fallback logger that does NOT propagate to TelemetryHandler to avoid recursion
_fallback_logger = logging.getLogger("llama_stack.telemetry.background")
if not _fallback_logger.handlers:
    _fallback_logger.propagate = False
    _fallback_logger.setLevel(logging.ERROR)
    _fallback_handler = logging.StreamHandler(sys.stderr)
    _fallback_handler.setLevel(logging.ERROR)
    _fallback_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
    _fallback_logger.addHandler(_fallback_handler)

INVALID_SPAN_ID = 0x0000000000000000
INVALID_TRACE_ID = 0x00000000000000000000000000000000
@@ -79,19 +91,32 @@ def generate_trace_id() -> str:
CURRENT_TRACE_CONTEXT = contextvars.ContextVar("trace_context", default=None)
BACKGROUND_LOGGER = None

LOG_QUEUE_FULL_LOG_INTERVAL_SECONDS = 60.0


class BackgroundLogger:
    def __init__(self, api: Telemetry, capacity: int = 1000):
    def __init__(self, api: Telemetry, capacity: int = 100000):
        self.api = api
        self.log_queue = queue.Queue(maxsize=capacity)
        self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity)
        self.worker_thread = threading.Thread(target=self._process_logs, daemon=True)
        self.worker_thread.start()
        self._last_queue_full_log_time: float = 0.0
        self._dropped_since_last_notice: int = 0

    def log_event(self, event):
        try:
            self.log_queue.put_nowait(event)
        except queue.Full:
            logger.error("Log queue is full, dropping event")
            # Aggregate drops and emit at most once per interval via fallback logger
            self._dropped_since_last_notice += 1
            current_time = time.time()
            if current_time - self._last_queue_full_log_time >= LOG_QUEUE_FULL_LOG_INTERVAL_SECONDS:
                _fallback_logger.error(
                    "Log queue is full; dropped %d events since last notice",
                    self._dropped_since_last_notice,
                )
                self._last_queue_full_log_time = current_time
                self._dropped_since_last_notice = 0

    def _process_logs(self):
        while True:
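Note: the log_event change above stops logging on every dropped event and instead aggregates drops, reporting through the non-propagating fallback logger at most once per LOG_QUEUE_FULL_LOG_INTERVAL_SECONDS. A stripped-down sketch of that rate-limited reporting pattern; the DropReporter class and logger name are illustrative and not part of the codebase:

import logging
import time

_demo_logger = logging.getLogger("demo.queue_full")


class DropReporter:
    """Counts dropped events and logs a summary at most once per interval."""

    def __init__(self, interval_seconds: float = 60.0):
        self.interval_seconds = interval_seconds
        self._last_notice = 0.0
        self._dropped = 0

    def record_drop(self) -> None:
        self._dropped += 1
        now = time.time()
        if now - self._last_notice >= self.interval_seconds:
            _demo_logger.error("Queue is full; dropped %d events since last notice", self._dropped)
            self._last_notice = now
            self._dropped = 0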