diff --git a/.github/actions/setup-runner/action.yml b/.github/actions/setup-runner/action.yml index 448fdbbfe..905d6b73a 100644 --- a/.github/actions/setup-runner/action.yml +++ b/.github/actions/setup-runner/action.yml @@ -29,8 +29,8 @@ runs: # Install llama-stack-client-python based on the client-version input if [ "${{ inputs.client-version }}" = "latest" ]; then - echo "Installing latest llama-stack-client-python from next branch" - uv pip install git+https://github.com/llamastack/llama-stack-client-python.git@next + echo "Installing latest llama-stack-client-python from main branch" + uv pip install git+https://github.com/llamastack/llama-stack-client-python.git@main elif [ "${{ inputs.client-version }}" = "published" ]; then echo "Installing published llama-stack-client-python from PyPI" uv pip install llama-stack-client diff --git a/.github/actions/setup-test-environment/action.yml b/.github/actions/setup-test-environment/action.yml index ececca0f6..478e8f598 100644 --- a/.github/actions/setup-test-environment/action.yml +++ b/.github/actions/setup-test-environment/action.yml @@ -44,8 +44,8 @@ runs: run: | # Install llama-stack-client-python based on the client-version input if [ "${{ inputs.client-version }}" = "latest" ]; then - echo "Installing latest llama-stack-client-python from next branch" - export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@next + echo "Installing latest llama-stack-client-python from main branch" + export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main elif [ "${{ inputs.client-version }}" = "published" ]; then echo "Installing published llama-stack-client-python from PyPI" unset LLAMA_STACK_CLIENT_DIR diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html index 9e28e0f42..32ead1764 100644 --- a/docs/static/llama-stack-spec.html +++ b/docs/static/llama-stack-spec.html @@ -1035,50 +1035,6 @@ ] } }, - "/v1/inference/embeddings": { - "post": { - "responses": { - "200": { - "description": "An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EmbeddingsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Inference" - ], - "summary": "Generate embeddings for content pieces using the specified model.", - "description": "Generate embeddings for content pieces using the specified model.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EmbeddingsRequest" - } - } - }, - "required": true - } - } - }, "/v1alpha/eval/benchmarks/{benchmark_id}/evaluations": { "post": { "responses": { @@ -5475,7 +5431,7 @@ } } }, - "/v1/inference/rerank": { + "/v1alpha/inference/rerank": { "post": { "responses": { "200": { @@ -10547,80 +10503,6 @@ "title": "OpenAIDeleteResponseObject", "description": "Response object confirming deletion of an OpenAI response." }, - "EmbeddingsRequest": { - "type": "object", - "properties": { - "model_id": { - "type": "string", - "description": "The identifier of the model to use. 
The model must be an embedding model registered with Llama Stack and available via the /models endpoint." - }, - "contents": { - "oneOf": [ - { - "type": "array", - "items": { - "type": "string" - } - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/InterleavedContentItem" - } - } - ], - "description": "List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text." - }, - "text_truncation": { - "type": "string", - "enum": [ - "none", - "start", - "end" - ], - "description": "(Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length." - }, - "output_dimension": { - "type": "integer", - "description": "(Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models." - }, - "task_type": { - "type": "string", - "enum": [ - "query", - "document" - ], - "description": "(Optional) How is the embedding being used? This is only supported by asymmetric embedding models." - } - }, - "additionalProperties": false, - "required": [ - "model_id", - "contents" - ], - "title": "EmbeddingsRequest" - }, - "EmbeddingsResponse": { - "type": "object", - "properties": { - "embeddings": { - "type": "array", - "items": { - "type": "array", - "items": { - "type": "number" - } - }, - "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}" - } - }, - "additionalProperties": false, - "required": [ - "embeddings" - ], - "title": "EmbeddingsResponse", - "description": "Response containing generated embeddings." - }, "AgentCandidate": { "type": "object", "properties": { diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml index 1c06c74a5..3b5b92060 100644 --- a/docs/static/llama-stack-spec.yaml +++ b/docs/static/llama-stack-spec.yaml @@ -720,41 +720,6 @@ paths: required: true schema: type: string - /v1/inference/embeddings: - post: - responses: - '200': - description: >- - An array of embeddings, one for each content. Each embedding is a list - of floats. The dimensionality of the embedding is model-specific; you - can check model metadata using /models/{model_id}. - content: - application/json: - schema: - $ref: '#/components/schemas/EmbeddingsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Inference - summary: >- - Generate embeddings for content pieces using the specified model. - description: >- - Generate embeddings for content pieces using the specified model. - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/EmbeddingsRequest' - required: true /v1alpha/eval/benchmarks/{benchmark_id}/evaluations: post: responses: @@ -3930,7 +3895,7 @@ paths: schema: $ref: '#/components/schemas/QueryTracesRequest' required: true - /v1/inference/rerank: + /v1alpha/inference/rerank: post: responses: '200': @@ -7795,72 +7760,6 @@ components: title: OpenAIDeleteResponseObject description: >- Response object confirming deletion of an OpenAI response. 
- EmbeddingsRequest: - type: object - properties: - model_id: - type: string - description: >- - The identifier of the model to use. The model must be an embedding model - registered with Llama Stack and available via the /models endpoint. - contents: - oneOf: - - type: array - items: - type: string - - type: array - items: - $ref: '#/components/schemas/InterleavedContentItem' - description: >- - List of contents to generate embeddings for. Each content can be a string - or an InterleavedContentItem (and hence can be multimodal). The behavior - depends on the model and provider. Some models may only support text. - text_truncation: - type: string - enum: - - none - - start - - end - description: >- - (Optional) Config for how to truncate text for embedding when text is - longer than the model's max sequence length. - output_dimension: - type: integer - description: >- - (Optional) Output dimensionality for the embeddings. Only supported by - Matryoshka models. - task_type: - type: string - enum: - - query - - document - description: >- - (Optional) How is the embedding being used? This is only supported by - asymmetric embedding models. - additionalProperties: false - required: - - model_id - - contents - title: EmbeddingsRequest - EmbeddingsResponse: - type: object - properties: - embeddings: - type: array - items: - type: array - items: - type: number - description: >- - List of embedding vectors, one per input content. Each embedding is a - list of floats. The dimensionality of the embedding is model-specific; - you can check model metadata using /models/{model_id} - additionalProperties: false - required: - - embeddings - title: EmbeddingsResponse - description: >- - Response containing generated embeddings. AgentCandidate: type: object properties: diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index 756896796..134da5bf8 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -17,11 +17,11 @@ from typing import ( from pydantic import BaseModel, Field, field_validator from typing_extensions import TypedDict -from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent, InterleavedContentItem +from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent from llama_stack.apis.common.responses import Order from llama_stack.apis.models import Model from llama_stack.apis.telemetry import MetricResponseMixin -from llama_stack.apis.version import LLAMA_STACK_API_V1 +from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA from llama_stack.models.llama.datatypes import ( BuiltinTool, StopReason, @@ -1070,27 +1070,7 @@ class InferenceProvider(Protocol): """ ... - @webmethod(route="/inference/embeddings", method="POST", level=LLAMA_STACK_API_V1) - async def embeddings( - self, - model_id: str, - contents: list[str] | list[InterleavedContentItem], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - """Generate embeddings for content pieces using the specified model. - - :param model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint. - :param contents: List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). 
The behavior depends on the model and provider. Some models may only support text. - :param output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models. - :param text_truncation: (Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length. - :param task_type: (Optional) How is the embedding being used? This is only supported by asymmetric embedding models. - :returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}. - """ - ... - - @webmethod(route="/inference/rerank", method="POST", experimental=True, level=LLAMA_STACK_API_V1) + @webmethod(route="/inference/rerank", method="POST", level=LLAMA_STACK_API_V1ALPHA) async def rerank( self, model: str, diff --git a/llama_stack/core/datatypes.py b/llama_stack/core/datatypes.py index b5558c66f..6a297f012 100644 --- a/llama_stack/core/datatypes.py +++ b/llama_stack/core/datatypes.py @@ -433,6 +433,12 @@ class InferenceStoreConfig(BaseModel): num_writers: int = Field(default=4, description="Number of concurrent background writers") +class ResponsesStoreConfig(BaseModel): + sql_store_config: SqlStoreConfig + max_write_queue_size: int = Field(default=10000, description="Max queued writes for responses store") + num_writers: int = Field(default=4, description="Number of concurrent background writers") + + class StackRunConfig(BaseModel): version: int = LLAMA_STACK_RUN_CONFIG_VERSION diff --git a/llama_stack/core/resolver.py b/llama_stack/core/resolver.py index 373446de6..f421c47ed 100644 --- a/llama_stack/core/resolver.py +++ b/llama_stack/core/resolver.py @@ -29,6 +29,7 @@ from llama_stack.apis.telemetry import Telemetry from llama_stack.apis.tools import ToolGroups, ToolRuntime from llama_stack.apis.vector_dbs import VectorDBs from llama_stack.apis.vector_io import VectorIO +from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA from llama_stack.core.client import get_client_impl from llama_stack.core.datatypes import ( AccessRule, @@ -412,8 +413,14 @@ def check_protocol_compliance(obj: Any, protocol: Any) -> None: mro = type(obj).__mro__ for name, value in inspect.getmembers(protocol): - if inspect.isfunction(value) and hasattr(value, "__webmethod__"): - if value.__webmethod__.experimental: + if inspect.isfunction(value) and hasattr(value, "__webmethods__"): + has_alpha_api = False + for webmethod in value.__webmethods__: + if webmethod.level == LLAMA_STACK_API_V1ALPHA: + has_alpha_api = True + break + # if this API has multiple webmethods, and one of them is an alpha API, this API should be skipped when checking for missing or not callable routes + if has_alpha_api: continue if not hasattr(obj, name): missing_methods.append((name, "missing")) diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index fcf01a9c4..80f47fb5d 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -16,7 +16,6 @@ from pydantic import Field, TypeAdapter from llama_stack.apis.common.content_types import ( InterleavedContent, - InterleavedContentItem, ) from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError from llama_stack.apis.inference import ( @@ -26,8 +25,6 @@ from llama_stack.apis.inference import ( CompletionMessage, CompletionResponse, CompletionResponseStreamChunk, - EmbeddingsResponse, - EmbeddingTaskType, Inference, 
ListOpenAIChatCompletionResponse, LogProbConfig, @@ -48,7 +45,6 @@ from llama_stack.apis.inference import ( ResponseFormat, SamplingParams, StopReason, - TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -312,25 +308,6 @@ class InferenceRouter(Inference): return response - async def embeddings( - self, - model_id: str, - contents: list[str] | list[InterleavedContentItem], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - logger.debug(f"InferenceRouter.embeddings: {model_id}") - await self._get_model(model_id, ModelType.embedding) - provider = await self.routing_table.get_provider_impl(model_id) - return await provider.embeddings( - model_id=model_id, - contents=contents, - text_truncation=text_truncation, - output_dimension=output_dimension, - task_type=task_type, - ) - async def openai_completion( self, model: str, diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index dcc08a482..467777b72 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -924,7 +924,7 @@ async def get_raw_document_text(document: Document) -> str: DeprecationWarning, stacklevel=2, ) - elif not (document.mime_type.startswith("text/") or document.mime_type == "application/yaml"): + elif not (document.mime_type.startswith("text/") or document.mime_type in ("application/yaml", "application/json")): raise ValueError(f"Unexpected document mime type: {document.mime_type}") if isinstance(document.content, URL): diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py index 29b935bbd..2206aa641 100644 --- a/llama_stack/providers/remote/inference/bedrock/bedrock.py +++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py @@ -11,21 +11,17 @@ from botocore.client import BaseClient from llama_stack.apis.common.content_types import ( InterleavedContent, - InterleavedContentItem, ) from llama_stack.apis.inference import ( ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseStreamChunk, - EmbeddingsResponse, - EmbeddingTaskType, Inference, LogProbConfig, Message, OpenAIEmbeddingsResponse, ResponseFormat, SamplingParams, - TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -47,8 +43,6 @@ from llama_stack.providers.utils.inference.openai_compat import ( ) from llama_stack.providers.utils.inference.prompt_adapter import ( chat_completion_request_to_prompt, - content_has_media, - interleaved_content_as_str, ) from .models import MODEL_ENTRIES @@ -218,36 +212,6 @@ class BedrockInferenceAdapter( ), } - async def embeddings( - self, - model_id: str, - contents: list[str] | list[InterleavedContentItem], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - model = await self.model_store.get_model(model_id) - - # Convert foundation model ID to inference profile ID - region_name = self.client.meta.region_name - inference_profile_id = _to_inference_profile_id(model.provider_resource_id, region_name) - - embeddings = [] - for content in contents: - assert not content_has_media(content), "Bedrock does not support media for embeddings" - input_text = 
interleaved_content_as_str(content) - input_body = {"inputText": input_text} - body = json.dumps(input_body) - response = self.client.invoke_model( - body=body, - modelId=inference_profile_id, - accept="application/json", - contentType="application/json", - ) - response_body = json.loads(response.get("body").read()) - embeddings.append(response_body.get("embedding")) - return EmbeddingsResponse(embeddings=embeddings) - async def openai_embeddings( self, model: str, diff --git a/llama_stack/providers/remote/inference/cerebras/cerebras.py b/llama_stack/providers/remote/inference/cerebras/cerebras.py index 6662f004d..6be39fa5d 100644 --- a/llama_stack/providers/remote/inference/cerebras/cerebras.py +++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py @@ -11,21 +11,17 @@ from cerebras.cloud.sdk import AsyncCerebras from llama_stack.apis.common.content_types import ( InterleavedContent, - InterleavedContentItem, ) from llama_stack.apis.inference import ( ChatCompletionRequest, CompletionRequest, CompletionResponse, - EmbeddingsResponse, - EmbeddingTaskType, Inference, LogProbConfig, Message, OpenAIEmbeddingsResponse, ResponseFormat, SamplingParams, - TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -187,16 +183,6 @@ class CerebrasInferenceAdapter( **get_sampling_options(request.sampling_params), } - async def embeddings( - self, - model_id: str, - contents: list[str] | list[InterleavedContentItem], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - raise NotImplementedError() - async def openai_embeddings( self, model: str, diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py index 6eac6e4f4..d85b477f5 100644 --- a/llama_stack/providers/remote/inference/databricks/databricks.py +++ b/llama_stack/providers/remote/inference/databricks/databricks.py @@ -11,15 +11,12 @@ from databricks.sdk import WorkspaceClient from llama_stack.apis.common.content_types import ( InterleavedContent, - InterleavedContentItem, ) from llama_stack.apis.inference import ( ChatCompletionResponse, ChatCompletionResponseStreamChunk, CompletionResponse, CompletionResponseStreamChunk, - EmbeddingsResponse, - EmbeddingTaskType, Inference, LogProbConfig, Message, @@ -27,7 +24,6 @@ from llama_stack.apis.inference import ( OpenAICompletion, ResponseFormat, SamplingParams, - TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -118,16 +114,6 @@ class DatabricksInferenceAdapter( ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]: raise NotImplementedError() - async def embeddings( - self, - model_id: str, - contents: list[str] | list[InterleavedContentItem], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - raise NotImplementedError() - async def list_models(self) -> list[Model] | None: self._model_cache = {} # from OpenAIMixin ws_client = WorkspaceClient(host=self.config.url, token=self.get_api_key()) # TODO: this is not async diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index 069a0a674..ed4b56fad 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ 
-10,22 +10,18 @@ from fireworks.client import Fireworks from llama_stack.apis.common.content_types import ( InterleavedContent, - InterleavedContentItem, ) from llama_stack.apis.inference import ( ChatCompletionRequest, ChatCompletionResponse, CompletionRequest, CompletionResponse, - EmbeddingsResponse, - EmbeddingTaskType, Inference, LogProbConfig, Message, ResponseFormat, ResponseFormatType, SamplingParams, - TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -48,8 +44,6 @@ from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.prompt_adapter import ( chat_completion_request_to_prompt, completion_request_to_prompt, - content_has_media, - interleaved_content_as_str, request_has_media, ) @@ -259,28 +253,3 @@ class FireworksInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Nee logger.debug(f"params to fireworks: {params}") return params - - async def embeddings( - self, - model_id: str, - contents: list[str] | list[InterleavedContentItem], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - model = await self.model_store.get_model(model_id) - - kwargs = {} - if model.metadata.get("embedding_dimension"): - kwargs["dimensions"] = model.metadata.get("embedding_dimension") - assert all(not content_has_media(content) for content in contents), ( - "Fireworks does not support media for embeddings" - ) - response = self._get_client().embeddings.create( - model=model.provider_resource_id, - input=[interleaved_content_as_str(content) for content in contents], - **kwargs, - ) - - embeddings = [data.embedding for data in response.data] - return EmbeddingsResponse(embeddings=embeddings) diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py index 92094a0f3..a31981adb 100644 --- a/llama_stack/providers/remote/inference/nvidia/nvidia.py +++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py @@ -11,8 +11,6 @@ from openai import NOT_GIVEN, APIConnectionError from llama_stack.apis.common.content_types import ( InterleavedContent, - InterleavedContentItem, - TextContentItem, ) from llama_stack.apis.inference import ( ChatCompletionRequest, @@ -21,8 +19,6 @@ from llama_stack.apis.inference import ( CompletionRequest, CompletionResponse, CompletionResponseStreamChunk, - EmbeddingsResponse, - EmbeddingTaskType, Inference, LogProbConfig, Message, @@ -31,7 +27,6 @@ from llama_stack.apis.inference import ( OpenAIEmbeddingUsage, ResponseFormat, SamplingParams, - TextTruncation, ToolChoice, ToolConfig, ) @@ -156,60 +151,6 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference): # we pass n=1 to get only one completion return convert_openai_completion_choice(response.choices[0]) - async def embeddings( - self, - model_id: str, - contents: list[str] | list[InterleavedContentItem], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - if any(content_has_media(content) for content in contents): - raise NotImplementedError("Media is not supported") - - # - # Llama Stack: contents = list[str] | list[InterleavedContentItem] - # -> - # OpenAI: input = str | list[str] - # - # we can ignore str and always pass list[str] to OpenAI - # - flat_contents = [content.text if isinstance(content, TextContentItem) else content for 
content in contents] - input = [content.text if isinstance(content, TextContentItem) else content for content in flat_contents] - provider_model_id = await self._get_provider_model_id(model_id) - - extra_body = {} - - if text_truncation is not None: - text_truncation_options = { - TextTruncation.none: "NONE", - TextTruncation.end: "END", - TextTruncation.start: "START", - } - extra_body["truncate"] = text_truncation_options[text_truncation] - - if output_dimension is not None: - extra_body["dimensions"] = output_dimension - - if task_type is not None: - task_type_options = { - EmbeddingTaskType.document: "passage", - EmbeddingTaskType.query: "query", - } - extra_body["input_type"] = task_type_options[task_type] - - response = await self.client.embeddings.create( - model=provider_model_id, - input=input, - extra_body=extra_body, - ) - # - # OpenAI: CreateEmbeddingResponse(data=[Embedding(embedding=list[float], ...)], ...) - # -> - # Llama Stack: EmbeddingsResponse(embeddings=list[list[float]]) - # - return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data]) - async def openai_embeddings( self, model: str, diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index 3fb10445f..16b104fb5 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -14,7 +14,6 @@ from ollama import AsyncClient as AsyncOllamaClient from llama_stack.apis.common.content_types import ( ImageContentItem, InterleavedContent, - InterleavedContentItem, TextContentItem, ) from llama_stack.apis.common.errors import UnsupportedModelError @@ -25,8 +24,6 @@ from llama_stack.apis.inference import ( CompletionRequest, CompletionResponse, CompletionResponseStreamChunk, - EmbeddingsResponse, - EmbeddingTaskType, GrammarResponseFormat, InferenceProvider, JsonSchemaResponseFormat, @@ -34,7 +31,6 @@ from llama_stack.apis.inference import ( Message, ResponseFormat, SamplingParams, - TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -66,9 +62,7 @@ from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.prompt_adapter import ( chat_completion_request_to_prompt, completion_request_to_prompt, - content_has_media, convert_image_content_to_url, - interleaved_content_as_str, request_has_media, ) @@ -363,27 +357,6 @@ class OllamaInferenceAdapter( async for chunk in process_chat_completion_stream_response(stream, request): yield chunk - async def embeddings( - self, - model_id: str, - contents: list[str] | list[InterleavedContentItem], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - model = await self._get_model(model_id) - - assert all(not content_has_media(content) for content in contents), ( - "Ollama does not support media for embeddings" - ) - response = await self.ollama_client.embed( - model=model.provider_resource_id, - input=[interleaved_content_as_str(content) for content in contents], - ) - embeddings = response["embeddings"] - - return EmbeddingsResponse(embeddings=embeddings) - async def register_model(self, model: Model) -> Model: if await self.check_model_availability(model.provider_model_id): return model diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py 
index a2bdf0369..ae482b7b0 100644 --- a/llama_stack/providers/remote/inference/passthrough/passthrough.py +++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py @@ -14,8 +14,6 @@ from llama_stack.apis.inference import ( ChatCompletionResponse, ChatCompletionResponseStreamChunk, CompletionMessage, - EmbeddingsResponse, - EmbeddingTaskType, Inference, LogProbConfig, Message, @@ -27,7 +25,6 @@ from llama_stack.apis.inference import ( OpenAIResponseFormatParam, ResponseFormat, SamplingParams, - TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -190,25 +187,6 @@ class PassthroughInferenceAdapter(Inference): chunk = convert_to_pydantic(ChatCompletionResponseStreamChunk, chunk) yield chunk - async def embeddings( - self, - model_id: str, - contents: list[InterleavedContent], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - client = self._get_client() - model = await self.model_store.get_model(model_id) - - return await client.inference.embeddings( - model_id=model.provider_resource_id, - contents=contents, - text_truncation=text_truncation, - output_dimension=output_dimension, - task_type=task_type, - ) - async def openai_embeddings( self, model: str, diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py index ff2fe6401..82252b04d 100644 --- a/llama_stack/providers/remote/inference/runpod/runpod.py +++ b/llama_stack/providers/remote/inference/runpod/runpod.py @@ -136,16 +136,6 @@ class RunpodInferenceAdapter( **get_sampling_options(request.sampling_params), } - async def embeddings( - self, - model: str, - contents: list[str] | list[InterleavedContentItem], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - raise NotImplementedError() - async def openai_embeddings( self, model: str, diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py index 27597900f..e1632e4a0 100644 --- a/llama_stack/providers/remote/inference/tgi/tgi.py +++ b/llama_stack/providers/remote/inference/tgi/tgi.py @@ -12,14 +12,11 @@ from pydantic import SecretStr from llama_stack.apis.common.content_types import ( InterleavedContent, - InterleavedContentItem, ) from llama_stack.apis.inference import ( ChatCompletionRequest, ChatCompletionResponse, CompletionRequest, - EmbeddingsResponse, - EmbeddingTaskType, Inference, LogProbConfig, Message, @@ -27,7 +24,6 @@ from llama_stack.apis.inference import ( ResponseFormat, ResponseFormatType, SamplingParams, - TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -306,16 +302,6 @@ class _HfAdapter( **self._build_options(request.sampling_params, request.response_format), ) - async def embeddings( - self, - model_id: str, - contents: list[str] | list[InterleavedContentItem], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - raise NotImplementedError() - async def openai_embeddings( self, model: str, diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py index c199677be..083c528bb 100644 --- a/llama_stack/providers/remote/inference/together/together.py +++ 
b/llama_stack/providers/remote/inference/together/together.py @@ -12,14 +12,11 @@ from together.constants import BASE_URL from llama_stack.apis.common.content_types import ( InterleavedContent, - InterleavedContentItem, ) from llama_stack.apis.inference import ( ChatCompletionRequest, ChatCompletionResponse, CompletionRequest, - EmbeddingsResponse, - EmbeddingTaskType, Inference, LogProbConfig, Message, @@ -27,7 +24,6 @@ from llama_stack.apis.inference import ( ResponseFormat, ResponseFormatType, SamplingParams, - TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -50,8 +46,6 @@ from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.prompt_adapter import ( chat_completion_request_to_prompt, completion_request_to_prompt, - content_has_media, - interleaved_content_as_str, request_has_media, ) @@ -247,26 +241,6 @@ class TogetherInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Need logger.debug(f"params to together: {params}") return params - async def embeddings( - self, - model_id: str, - contents: list[str] | list[InterleavedContentItem], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - model = await self.model_store.get_model(model_id) - assert all(not content_has_media(content) for content in contents), ( - "Together does not support media for embeddings" - ) - client = self._get_client() - r = await client.embeddings.create( - model=model.provider_resource_id, - input=[interleaved_content_as_str(content) for content in contents], - ) - embeddings = [item.embedding for item in r.data] - return EmbeddingsResponse(embeddings=embeddings) - async def list_models(self) -> list[Model] | None: self._model_cache = {} # Together's /v1/models is not compatible with OpenAI's /v1/models. 
Together support ticket #13355 -> will not fix, use Together's own client diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 8fbb4b815..bef5cbf2c 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -16,7 +16,6 @@ from openai.types.chat.chat_completion_chunk import ( from llama_stack.apis.common.content_types import ( InterleavedContent, - InterleavedContentItem, TextDelta, ToolCallDelta, ToolCallParseStatus, @@ -31,8 +30,6 @@ from llama_stack.apis.inference import ( CompletionRequest, CompletionResponse, CompletionResponseStreamChunk, - EmbeddingsResponse, - EmbeddingTaskType, GrammarResponseFormat, Inference, JsonSchemaResponseFormat, @@ -41,7 +38,6 @@ from llama_stack.apis.inference import ( ModelStore, ResponseFormat, SamplingParams, - TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -74,8 +70,6 @@ from llama_stack.providers.utils.inference.openai_compat import ( from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.prompt_adapter import ( completion_request_to_prompt, - content_has_media, - interleaved_content_as_str, request_has_media, ) @@ -550,27 +544,3 @@ class VLLMInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin, Inference, ModelsPro "stream": request.stream, **options, } - - async def embeddings( - self, - model_id: str, - contents: list[str] | list[InterleavedContentItem], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - model = await self._get_model(model_id) - - kwargs = {} - assert model.model_type == ModelType.embedding - assert model.metadata.get("embedding_dimension") - kwargs["dimensions"] = model.metadata.get("embedding_dimension") - assert all(not content_has_media(content) for content in contents), "VLLM does not support media for embeddings" - response = await self.client.embeddings.create( - model=model.provider_resource_id, - input=[interleaved_content_as_str(content) for content in contents], - **kwargs, - ) - - embeddings = [data.embedding for data in response.data] - return EmbeddingsResponse(embeddings=embeddings) diff --git a/llama_stack/providers/remote/inference/watsonx/watsonx.py b/llama_stack/providers/remote/inference/watsonx/watsonx.py index cb8b45565..00b9acc06 100644 --- a/llama_stack/providers/remote/inference/watsonx/watsonx.py +++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py @@ -11,13 +11,11 @@ from ibm_watsonx_ai.foundation_models import Model from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams from openai import AsyncOpenAI -from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem +from llama_stack.apis.common.content_types import InterleavedContent from llama_stack.apis.inference import ( ChatCompletionRequest, ChatCompletionResponse, CompletionRequest, - EmbeddingsResponse, - EmbeddingTaskType, GreedySamplingStrategy, Inference, LogProbConfig, @@ -30,7 +28,6 @@ from llama_stack.apis.inference import ( OpenAIResponseFormatParam, ResponseFormat, SamplingParams, - TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -265,16 +262,6 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper): } return params - async def embeddings( - self, - model_id: str, - contents: list[str] | list[InterleavedContentItem], - 
text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - raise NotImplementedError("embedding is not supported for watsonx") - async def openai_embeddings( self, model: str, diff --git a/llama_stack/providers/utils/inference/embedding_mixin.py b/llama_stack/providers/utils/inference/embedding_mixin.py index 9bd0aa8ce..facc59f65 100644 --- a/llama_stack/providers/utils/inference/embedding_mixin.py +++ b/llama_stack/providers/utils/inference/embedding_mixin.py @@ -15,16 +15,11 @@ if TYPE_CHECKING: from sentence_transformers import SentenceTransformer from llama_stack.apis.inference import ( - EmbeddingsResponse, - EmbeddingTaskType, - InterleavedContentItem, ModelStore, OpenAIEmbeddingData, OpenAIEmbeddingsResponse, OpenAIEmbeddingUsage, - TextTruncation, ) -from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str EMBEDDING_MODELS = {} @@ -35,23 +30,6 @@ log = get_logger(name=__name__, category="providers::utils") class SentenceTransformerEmbeddingMixin: model_store: ModelStore - async def embeddings( - self, - model_id: str, - contents: list[str] | list[InterleavedContentItem], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - model = await self.model_store.get_model(model_id) - embedding_model = await self._load_sentence_transformer_model(model.provider_resource_id) - embeddings = await asyncio.to_thread( - embedding_model.encode, - [interleaved_content_as_str(content) for content in contents], - show_progress_bar=False, - ) - return EmbeddingsResponse(embeddings=embeddings) - async def openai_embeddings( self, model: str, diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py index b1e38f323..966081e9f 100644 --- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py +++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py @@ -11,14 +11,11 @@ import litellm from llama_stack.apis.common.content_types import ( InterleavedContent, - InterleavedContentItem, ) from llama_stack.apis.inference import ( ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseStreamChunk, - EmbeddingsResponse, - EmbeddingTaskType, InferenceProvider, JsonSchemaResponseFormat, LogProbConfig, @@ -32,7 +29,6 @@ from llama_stack.apis.inference import ( OpenAIResponseFormatParam, ResponseFormat, SamplingParams, - TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -50,9 +46,6 @@ from llama_stack.providers.utils.inference.openai_compat import ( get_sampling_options, prepare_openai_completion_params, ) -from llama_stack.providers.utils.inference.prompt_adapter import ( - interleaved_content_as_str, -) logger = get_logger(name=__name__, category="providers::utils") @@ -269,24 +262,6 @@ class LiteLLMOpenAIMixin( ) return api_key - async def embeddings( - self, - model_id: str, - contents: list[str] | list[InterleavedContentItem], - text_truncation: TextTruncation | None = TextTruncation.none, - output_dimension: int | None = None, - task_type: EmbeddingTaskType | None = None, - ) -> EmbeddingsResponse: - model = await self.model_store.get_model(model_id) - - response = litellm.embedding( - model=self.get_litellm_model_name(model.provider_resource_id), - input=[interleaved_content_as_str(content) for content in contents], - ) 
- - embeddings = [data["embedding"] for data in response["data"]] - return EmbeddingsResponse(embeddings=embeddings) - async def openai_embeddings( self, model: str, diff --git a/llama_stack/providers/utils/responses/responses_store.py b/llama_stack/providers/utils/responses/responses_store.py index 829cd8a62..b9fceb1ab 100644 --- a/llama_stack/providers/utils/responses/responses_store.py +++ b/llama_stack/providers/utils/responses/responses_store.py @@ -3,6 +3,9 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import asyncio +from typing import Any + from llama_stack.apis.agents import ( Order, ) @@ -14,24 +17,51 @@ from llama_stack.apis.agents.openai_responses import ( OpenAIResponseObject, OpenAIResponseObjectWithInput, ) -from llama_stack.core.datatypes import AccessRule +from llama_stack.core.datatypes import AccessRule, ResponsesStoreConfig from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR +from llama_stack.log import get_logger from ..sqlstore.api import ColumnDefinition, ColumnType from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore -from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl +from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, SqlStoreType, sqlstore_impl + +logger = get_logger(name=__name__, category="responses_store") class ResponsesStore: - def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]): - if not sql_store_config: - sql_store_config = SqliteSqlStoreConfig( + def __init__( + self, + config: ResponsesStoreConfig | SqlStoreConfig, + policy: list[AccessRule], + ): + # Handle backward compatibility + if not isinstance(config, ResponsesStoreConfig): + # Legacy: SqlStoreConfig passed directly as config + config = ResponsesStoreConfig( + sql_store_config=config, + ) + + self.config = config + self.sql_store_config = config.sql_store_config + if not self.sql_store_config: + self.sql_store_config = SqliteSqlStoreConfig( db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(), ) - self.sql_store = AuthorizedSqlStore(sqlstore_impl(sql_store_config), policy) + self.sql_store = None + self.policy = policy + + # Disable write queue for SQLite to avoid concurrency issues + self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite + + # Async write queue and worker control + self._queue: asyncio.Queue[tuple[OpenAIResponseObject, list[OpenAIResponseInput]]] | None = None + self._worker_tasks: list[asyncio.Task[Any]] = [] + self._max_write_queue_size: int = config.max_write_queue_size + self._num_writers: int = max(1, config.num_writers) async def initialize(self): """Create the necessary tables if they don't exist.""" + self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config), self.policy) await self.sql_store.create_table( "openai_responses", { @@ -42,9 +72,68 @@ class ResponsesStore: }, ) + if self.enable_write_queue: + self._queue = asyncio.Queue(maxsize=self._max_write_queue_size) + for _ in range(self._num_writers): + self._worker_tasks.append(asyncio.create_task(self._worker_loop())) + else: + logger.info("Write queue disabled for SQLite to avoid concurrency issues") + + async def shutdown(self) -> None: + if not self._worker_tasks: + return + if self._queue is not None: + await self._queue.join() + for t in self._worker_tasks: + if not t.done(): + t.cancel() + for t in self._worker_tasks: + try: + await t + except asyncio.CancelledError: + pass + 
self._worker_tasks.clear() + + async def flush(self) -> None: + """Wait for all queued writes to complete. Useful for testing.""" + if self.enable_write_queue and self._queue is not None: + await self._queue.join() + async def store_response_object( self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput] ) -> None: + if self.enable_write_queue: + if self._queue is None: + raise ValueError("Responses store is not initialized") + try: + self._queue.put_nowait((response_object, input)) + except asyncio.QueueFull: + logger.warning(f"Write queue full; adding response id={getattr(response_object, 'id', '')}") + await self._queue.put((response_object, input)) + else: + await self._write_response_object(response_object, input) + + async def _worker_loop(self) -> None: + assert self._queue is not None + while True: + try: + item = await self._queue.get() + except asyncio.CancelledError: + break + response_object, input = item + try: + await self._write_response_object(response_object, input) + except Exception as e: # noqa: BLE001 + logger.error(f"Error writing response object: {e}") + finally: + self._queue.task_done() + + async def _write_response_object( + self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput] + ) -> None: + if self.sql_store is None: + raise ValueError("Responses store is not initialized") + data = response_object.model_dump() data["input"] = [input_item.model_dump() for input_item in input] @@ -73,6 +162,9 @@ class ResponsesStore: :param model: The model to filter by. :param order: The order to sort the responses by. """ + if not self.sql_store: + raise ValueError("Responses store is not initialized") + if not order: order = Order.desc @@ -100,6 +192,9 @@ class ResponsesStore: """ Get a response object with automatic access control checking. """ + if not self.sql_store: + raise ValueError("Responses store is not initialized") + row = await self.sql_store.fetch_one( "openai_responses", where={"id": response_id}, @@ -113,6 +208,9 @@ class ResponsesStore: return OpenAIResponseObjectWithInput(**row["response_object"]) async def delete_response_object(self, response_id: str) -> OpenAIDeleteResponseObject: + if not self.sql_store: + raise ValueError("Responses store is not initialized") + row = await self.sql_store.fetch_one("openai_responses", where={"id": response_id}) if not row: raise ValueError(f"Response with id {response_id} not found") diff --git a/llama_stack/schema_utils.py b/llama_stack/schema_utils.py index 4f8b4edff..c58fcdd01 100644 --- a/llama_stack/schema_utils.py +++ b/llama_stack/schema_utils.py @@ -22,7 +22,6 @@ class WebMethod: raw_bytes_request_body: bool | None = False # A descriptive name of the corresponding span created by tracing descriptive_name: str | None = None - experimental: bool | None = False required_scope: str | None = None deprecated: bool | None = False @@ -39,7 +38,6 @@ def webmethod( response_examples: list[Any] | None = None, raw_bytes_request_body: bool | None = False, descriptive_name: str | None = None, - experimental: bool | None = False, required_scope: str | None = None, deprecated: bool | None = False, ) -> Callable[[T], T]: @@ -50,7 +48,6 @@ def webmethod( :param public: True if the operation can be invoked without prior authentication. :param request_examples: Sample requests that the operation might take. Pass a list of objects, not JSON. :param response_examples: Sample responses that the operation might produce. Pass a list of objects, not JSON. 
- :param experimental: True if the operation is experimental and subject to change. :param required_scope: Required scope for this endpoint (e.g., 'monitoring.viewer'). """ @@ -64,7 +61,6 @@ def webmethod( response_examples=response_examples, raw_bytes_request_body=raw_bytes_request_body, descriptive_name=descriptive_name, - experimental=experimental, required_scope=required_scope, deprecated=deprecated, ) diff --git a/tests/unit/providers/agent/test_get_raw_document_text.py b/tests/unit/providers/agent/test_get_raw_document_text.py index eb481c0d8..302a893b1 100644 --- a/tests/unit/providers/agent/test_get_raw_document_text.py +++ b/tests/unit/providers/agent/test_get_raw_document_text.py @@ -107,14 +107,34 @@ async def test_get_raw_document_text_deprecated_text_yaml_with_text_content_item assert "text/yaml" in str(w[0].message) +async def test_get_raw_document_text_supports_json_mime_type(): + """Test that the function accepts application/json mime type.""" + json_content = '{"name": "test", "version": "1.0", "items": ["item1", "item2"]}' + + document = Document(content=json_content, mime_type="application/json") + + result = await get_raw_document_text(document) + assert result == json_content + + +async def test_get_raw_document_text_with_json_text_content_item(): + """Test that the function handles JSON TextContentItem correctly.""" + json_content = '{"key": "value", "nested": {"array": [1, 2, 3]}}' + + document = Document(content=TextContentItem(text=json_content), mime_type="application/json") + + result = await get_raw_document_text(document) + assert result == json_content + + async def test_get_raw_document_text_rejects_unsupported_mime_types(): """Test that the function rejects unsupported mime types.""" document = Document( content="Some content", - mime_type="application/json", # Not supported + mime_type="application/pdf", # Not supported ) - with pytest.raises(ValueError, match="Unexpected document mime type: application/json"): + with pytest.raises(ValueError, match="Unexpected document mime type: application/pdf"): await get_raw_document_text(document) diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py index a964bc219..38ce365c1 100644 --- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py +++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py @@ -42,10 +42,12 @@ from llama_stack.apis.inference import ( ) from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime from llama_stack.core.access_control.access_control import default_policy +from llama_stack.core.datatypes import ResponsesStoreConfig from llama_stack.providers.inline.agents.meta_reference.responses.openai_responses import ( OpenAIResponsesImpl, ) from llama_stack.providers.utils.responses.responses_store import ResponsesStore +from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig from tests.unit.providers.agents.meta_reference.fixtures import load_chat_completion_fixture @@ -677,7 +679,9 @@ async def test_responses_store_list_input_items_logic(): # Create mock store and response store mock_sql_store = AsyncMock() - responses_store = ResponsesStore(sql_store_config=None, policy=default_policy()) + responses_store = ResponsesStore( + ResponsesStoreConfig(sql_store_config=SqliteSqlStoreConfig(db_path="mock_db_path")), policy=default_policy() + ) responses_store.sql_store = mock_sql_store # Setup test data 
- multiple input items diff --git a/tests/unit/providers/vector_io/test_faiss.py b/tests/unit/providers/vector_io/test_faiss.py index 90108d7a0..9ee5c82f4 100644 --- a/tests/unit/providers/vector_io/test_faiss.py +++ b/tests/unit/providers/vector_io/test_faiss.py @@ -5,13 +5,12 @@ # the root directory of this source tree. import asyncio -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import MagicMock, patch import numpy as np import pytest from llama_stack.apis.files import Files -from llama_stack.apis.inference import EmbeddingsResponse, Inference from llama_stack.apis.vector_dbs import VectorDB from llama_stack.apis.vector_io import Chunk, QueryChunksResponse from llama_stack.providers.datatypes import HealthStatus @@ -70,13 +69,6 @@ def mock_vector_db(vector_db_id, embedding_dimension) -> MagicMock: return mock_vector_db -@pytest.fixture -def mock_inference_api(sample_embeddings): - mock_api = MagicMock(spec=Inference) - mock_api.embeddings = AsyncMock(return_value=EmbeddingsResponse(embeddings=sample_embeddings)) - return mock_api - - @pytest.fixture def mock_files_api(): mock_api = MagicMock(spec=Files) @@ -96,22 +88,6 @@ async def faiss_index(embedding_dimension): yield index -@pytest.fixture -async def faiss_adapter(faiss_config, mock_inference_api, mock_files_api) -> FaissVectorIOAdapter: - # Create the adapter - adapter = FaissVectorIOAdapter(config=faiss_config, inference_api=mock_inference_api, files_api=mock_files_api) - - # Create a mock KVStore - mock_kvstore = MagicMock() - mock_kvstore.values_in_range = AsyncMock(return_value=[]) - - # Patch the initialize method to avoid the kvstore_impl call - with patch.object(FaissVectorIOAdapter, "initialize"): - # Set the kvstore directly - adapter.kvstore = mock_kvstore - yield adapter - - async def test_faiss_query_vector_returns_infinity_when_query_and_embedding_are_identical( faiss_index, sample_chunks, sample_embeddings, embedding_dimension ): diff --git a/tests/unit/utils/responses/test_responses_store.py b/tests/unit/utils/responses/test_responses_store.py index 44d4b30da..4e5256c1b 100644 --- a/tests/unit/utils/responses/test_responses_store.py +++ b/tests/unit/utils/responses/test_responses_store.py @@ -67,6 +67,9 @@ async def test_responses_store_pagination_basic(): input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")] await store.store_response_object(response, input_list) + # Wait for all queued writes to complete + await store.flush() + # Test 1: First page with limit=2, descending order (default) result = await store.list_responses(limit=2, order=Order.desc) assert len(result.data) == 2 @@ -110,6 +113,9 @@ async def test_responses_store_pagination_ascending(): input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")] await store.store_response_object(response, input_list) + # Wait for all queued writes to complete + await store.flush() + # Test ascending order pagination result = await store.list_responses(limit=1, order=Order.asc) assert len(result.data) == 1 @@ -145,6 +151,9 @@ async def test_responses_store_pagination_with_model_filter(): input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")] await store.store_response_object(response, input_list) + # Wait for all queued writes to complete + await store.flush() + # Test pagination with model filter result = await store.list_responses(limit=1, model="model-a", order=Order.desc) assert len(result.data) == 1 @@ -192,6 +201,9 @@ 
async def test_responses_store_pagination_no_limit(): input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")] await store.store_response_object(response, input_list) + # Wait for all queued writes to complete + await store.flush() + # Test without limit (should use default of 50) result = await store.list_responses(order=Order.desc) assert len(result.data) == 2 @@ -212,6 +224,9 @@ async def test_responses_store_get_response_object(): input_list = [create_test_response_input("Test input content", "input-test-resp")] await store.store_response_object(response, input_list) + # Wait for all queued writes to complete + await store.flush() + # Retrieve the response retrieved = await store.get_response_object("test-resp") assert retrieved.id == "test-resp" @@ -242,6 +257,9 @@ async def test_responses_store_input_items_pagination(): ] await store.store_response_object(response, input_list) + # Wait for all queued writes to complete + await store.flush() + # Verify all items are stored correctly with explicit IDs all_items = await store.list_response_input_items("test-resp", order=Order.desc) assert len(all_items.data) == 5 @@ -319,6 +337,9 @@ async def test_responses_store_input_items_before_pagination(): ] await store.store_response_object(response, input_list) + # Wait for all queued writes to complete + await store.flush() + # Test before pagination with descending order # In desc order: [Fifth, Fourth, Third, Second, First] # before="before-3" should return [Fifth, Fourth]
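
Note on the rerank move: the route changes from `/v1/inference/rerank` to `/v1alpha/inference/rerank`, with the removed `experimental` flag replaced by the `v1alpha` version prefix as the stability signal. A minimal sketch of calling the relocated endpoint over raw HTTP, assuming the default server port; the request-body fields are assumptions, since the rerank parameters are truncated in the hunks above:

```python
import httpx

# Only the path prefix changes in this diff; the body fields below are assumptions.
resp = httpx.post(
    "http://localhost:8321/v1alpha/inference/rerank",  # previously /v1/inference/rerank
    json={
        "model": "my-reranker",  # hypothetical rerank model id
        "query": "what is llama stack?",  # assumed field name
        "items": ["first candidate", "second candidate"],  # assumed field name
    },
)
resp.raise_for_status()
print(resp.json())
```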
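
With `embeddings()` deleted from `InferenceProvider` and from every adapter, the OpenAI-compatible `openai_embeddings()` method is the remaining embeddings surface. A hedged sketch of the replacement call; only the `model` parameter is visible in the truncated signatures above, so the `input` keyword and the model id are assumptions:

```python
from llama_stack.apis.inference import InferenceProvider, OpenAIEmbeddingsResponse


async def embed_texts(inference: InferenceProvider, texts: list[str]) -> list[list[float]]:
    # `model` is the only parameter shown in the hunks above; `input` is an
    # assumption about the rest of the OpenAI-compatible signature.
    response: OpenAIEmbeddingsResponse = await inference.openai_embeddings(
        model="nomic-embed-text",  # assumed embedding model id
        input=texts,
    )
    return [item.embedding for item in response.data]
```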
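
The new `ResponsesStore` write queue buffers `store_response_object()` calls and drains them from background worker tasks; `flush()` blocks until the queue is empty, which is why the pagination tests above call it before asserting. A minimal usage sketch built only from names in this diff (the SQLite path is illustrative; note the queue is deliberately disabled for SQLite, so writes there stay synchronous):

```python
import asyncio

from llama_stack.core.access_control.access_control import default_policy
from llama_stack.core.datatypes import ResponsesStoreConfig
from llama_stack.providers.utils.responses.responses_store import ResponsesStore
from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig


async def main() -> None:
    config = ResponsesStoreConfig(
        sql_store_config=SqliteSqlStoreConfig(db_path="/tmp/responses.db"),  # illustrative path
        max_write_queue_size=10_000,  # default; max pending writes before put() blocks
        num_writers=4,  # default; number of concurrent background writer tasks
    )
    store = ResponsesStore(config, policy=default_policy())
    await store.initialize()  # creates the table; starts workers on non-SQLite backends

    # store_response_object() enqueues the write (or, on SQLite, writes synchronously):
    # await store.store_response_object(response_object, input_items)

    await store.flush()  # wait for queued writes to land before reading back
    await store.shutdown()  # drain the queue, then cancel the worker tasks


asyncio.run(main())
```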