From f2598d30e66bfa025bc59973423cf92e8cd7d3b4 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 22 Oct 2025 11:39:00 -0700 Subject: [PATCH 1/5] chore: use --no-cache in Containerfile (#3884) # What does this PR do? debugging https://github.com/llamastack/llama-stack-ops/actions/runs/18700984422/job/53329700658 --no-cache was what build_container.sh used ## Test Plan --- containers/Containerfile | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/containers/Containerfile b/containers/Containerfile index 1ddf102af..1c878ea9b 100644 --- a/containers/Containerfile +++ b/containers/Containerfile @@ -45,7 +45,7 @@ RUN set -eux; \ exit 1; \ fi -RUN pip install --no-cache-dir uv +RUN pip install --no-cache uv ENV UV_SYSTEM_PYTHON=1 ENV INSTALL_MODE=${INSTALL_MODE} @@ -68,7 +68,7 @@ RUN set -eux; \ echo "LLAMA_STACK_CLIENT_DIR is set but $LLAMA_STACK_CLIENT_DIR does not exist" >&2; \ exit 1; \ fi; \ - uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"; \ + uv pip install --no-cache -e "$LLAMA_STACK_CLIENT_DIR"; \ fi; # Install llama-stack @@ -78,19 +78,19 @@ RUN set -eux; \ echo "INSTALL_MODE=editable requires LLAMA_STACK_DIR to point to a directory inside the build context" >&2; \ exit 1; \ fi; \ - uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"; \ + uv pip install --no-cache -e "$LLAMA_STACK_DIR"; \ elif [ "$INSTALL_MODE" = "test-pypi" ]; then \ - uv pip install --no-cache-dir fastapi libcst; \ + uv pip install --no-cache fastapi libcst; \ if [ -n "$TEST_PYPI_VERSION" ]; then \ - uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \ + uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \ else \ - uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \ + uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \ fi; \ else \ if [ -n "$PYPI_VERSION" ]; then \ - uv pip install --no-cache-dir "llama-stack==$PYPI_VERSION"; \ + uv pip install --no-cache "llama-stack==$PYPI_VERSION"; \ else \ - uv pip install --no-cache-dir llama-stack; \ + uv pip install --no-cache llama-stack; \ fi; \ fi; @@ -102,7 +102,7 @@ RUN set -eux; \ fi; \ deps="$(llama stack list-deps "$DISTRO_NAME")"; \ if [ -n "$deps" ]; then \ - printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache-dir; \ + printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache; \ fi # Cleanup From bb1ebb3c6b6aff8a5e5b0d20198f4268fe91f817 Mon Sep 17 00:00:00 2001 From: Jiayi Ni Date: Wed, 22 Oct 2025 12:02:28 -0700 Subject: [PATCH 2/5] feat: Add rerank models and rerank API change (#3831) # What does this PR do? - Extend the model type to include rerank models. - Implement `rerank()` method in inference router. - Add `rerank_model_list` to `OpenAIMixin` to enable providers to register and identify rerank models - Update documentation. 
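As a rough provider-side illustration (mirroring the unit test added below, not a file changed by this PR), a provider built on `OpenAIMixin` can surface rerank models by overriding the new `construct_model_from_identifier` hook introduced in this diff; the adapter class name and model IDs here are made up:

```python
from llama_stack.apis.models import Model, ModelType
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin


class MyRerankCapableAdapter(OpenAIMixin):
    # Hypothetical provider model IDs that should be registered as rerank models.
    # (A real adapter also implements the mixin's required config methods, omitted here.)
    rerank_model_ids: set[str] = {"my-rerank-v1"}

    def construct_model_from_identifier(self, identifier: str) -> Model:
        # Tag known rerank IDs; fall back to the mixin's default behavior
        # (embedding metadata lookup, otherwise LLM) for everything else.
        if identifier in self.rerank_model_ids:
            return Model(
                provider_id=self.__provider_id__,  # set by the provider framework
                provider_resource_id=identifier,
                identifier=identifier,
                model_type=ModelType.rerank,
            )
        return super().construct_model_from_identifier(identifier)
```

The router side is symmetric: `InferenceRouter.rerank()` resolves the model, checks that its type is `ModelType.rerank`, and forwards `query`, `items`, and `max_num_results` to the owning provider.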
## Test Plan ``` pytest tests/unit/providers/utils/inference/test_openai_mixin.py ``` --- docs/docs/providers/inference/index.mdx | 8 +- docs/static/deprecated-llama-stack-spec.html | 2 +- docs/static/deprecated-llama-stack-spec.yaml | 7 +- docs/static/llama-stack-spec.html | 5 +- docs/static/llama-stack-spec.yaml | 8 +- docs/static/stainless-llama-stack-spec.html | 5 +- docs/static/stainless-llama-stack-spec.yaml | 8 +- llama_stack/apis/inference/inference.py | 3 +- llama_stack/apis/models/models.py | 2 + llama_stack/core/routers/inference.py | 22 ++++ .../providers/utils/inference/openai_mixin.py | 41 +++--- .../utils/inference/test_openai_mixin.py | 118 ++++++++++++++++-- 12 files changed, 186 insertions(+), 43 deletions(-) diff --git a/docs/docs/providers/inference/index.mdx b/docs/docs/providers/inference/index.mdx index c2bf69962..478611420 100644 --- a/docs/docs/providers/inference/index.mdx +++ b/docs/docs/providers/inference/index.mdx @@ -3,9 +3,10 @@ description: "Inference Llama Stack Inference API for generating completions, chat completions, and embeddings. - This API provides the raw interface to the underlying models. Two kinds of models are supported: + This API provides the raw interface to the underlying models. Three kinds of models are supported: - LLM models: these models generate \"raw\" and \"chat\" (conversational) completions. - - Embedding models: these models generate embeddings to be used for semantic search." + - Embedding models: these models generate embeddings to be used for semantic search. + - Rerank models: these models reorder the documents based on their relevance to a query." sidebar_label: Inference title: Inference --- @@ -18,8 +19,9 @@ Inference Llama Stack Inference API for generating completions, chat completions, and embeddings. - This API provides the raw interface to the underlying models. Two kinds of models are supported: + This API provides the raw interface to the underlying models. Three kinds of models are supported: - LLM models: these models generate "raw" and "chat" (conversational) completions. - Embedding models: these models generate embeddings to be used for semantic search. + - Rerank models: these models reorder the documents based on their relevance to a query. This section contains documentation for all available providers for the **inference** API. diff --git a/docs/static/deprecated-llama-stack-spec.html b/docs/static/deprecated-llama-stack-spec.html index d920317cf..e3e182dd7 100644 --- a/docs/static/deprecated-llama-stack-spec.html +++ b/docs/static/deprecated-llama-stack-spec.html @@ -13467,7 +13467,7 @@ }, { "name": "Inference", - "description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.", + "description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. 
Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models: these models reorder the documents based on their relevance to a query.", "x-displayName": "Inference" }, { diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml index 66b2caeca..6b5b8230a 100644 --- a/docs/static/deprecated-llama-stack-spec.yaml +++ b/docs/static/deprecated-llama-stack-spec.yaml @@ -10218,13 +10218,16 @@ tags: embeddings. - This API provides the raw interface to the underlying models. Two kinds of models - are supported: + This API provides the raw interface to the underlying models. Three kinds of + models are supported: - LLM models: these models generate "raw" and "chat" (conversational) completions. - Embedding models: these models generate embeddings to be used for semantic search. + + - Rerank models: these models reorder the documents based on their relevance + to a query. x-displayName: Inference - name: Models description: '' diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html index 61deaec1e..584127d91 100644 --- a/docs/static/llama-stack-spec.html +++ b/docs/static/llama-stack-spec.html @@ -6859,7 +6859,8 @@ "type": "string", "enum": [ "llm", - "embedding" + "embedding", + "rerank" ], "title": "ModelType", "description": "Enumeration of supported model types in Llama Stack." @@ -13269,7 +13270,7 @@ }, { "name": "Inference", - "description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.", + "description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models: these models reorder the documents based on their relevance to a query.", "x-displayName": "Inference" }, { diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml index c6197b36f..90b1b3a2e 100644 --- a/docs/static/llama-stack-spec.yaml +++ b/docs/static/llama-stack-spec.yaml @@ -5269,6 +5269,7 @@ components: enum: - llm - embedding + - rerank title: ModelType description: >- Enumeration of supported model types in Llama Stack. @@ -10190,13 +10191,16 @@ tags: embeddings. - This API provides the raw interface to the underlying models. Two kinds of models - are supported: + This API provides the raw interface to the underlying models. Three kinds of + models are supported: - LLM models: these models generate "raw" and "chat" (conversational) completions. - Embedding models: these models generate embeddings to be used for semantic search. + + - Rerank models: these models reorder the documents based on their relevance + to a query. 
x-displayName: Inference - name: Inspect description: >- diff --git a/docs/static/stainless-llama-stack-spec.html b/docs/static/stainless-llama-stack-spec.html index 38122ebc0..f2d99a9c7 100644 --- a/docs/static/stainless-llama-stack-spec.html +++ b/docs/static/stainless-llama-stack-spec.html @@ -8531,7 +8531,8 @@ "type": "string", "enum": [ "llm", - "embedding" + "embedding", + "rerank" ], "title": "ModelType", "description": "Enumeration of supported model types in Llama Stack." @@ -17959,7 +17960,7 @@ }, { "name": "Inference", - "description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.", + "description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Three kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.\n- Rerank models: these models reorder the documents based on their relevance to a query.", "x-displayName": "Inference" }, { diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml index 93049a14a..9fe6cb6a3 100644 --- a/docs/static/stainless-llama-stack-spec.yaml +++ b/docs/static/stainless-llama-stack-spec.yaml @@ -6482,6 +6482,7 @@ components: enum: - llm - embedding + - rerank title: ModelType description: >- Enumeration of supported model types in Llama Stack. @@ -13585,13 +13586,16 @@ tags: embeddings. - This API provides the raw interface to the underlying models. Two kinds of models - are supported: + This API provides the raw interface to the underlying models. Three kinds of + models are supported: - LLM models: these models generate "raw" and "chat" (conversational) completions. - Embedding models: these models generate embeddings to be used for semantic search. + + - Rerank models: these models reorder the documents based on their relevance + to a query. x-displayName: Inference - name: Inspect description: >- diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index 027246470..049482837 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -1234,9 +1234,10 @@ class Inference(InferenceProvider): Llama Stack Inference API for generating completions, chat completions, and embeddings. - This API provides the raw interface to the underlying models. Two kinds of models are supported: + This API provides the raw interface to the underlying models. Three kinds of models are supported: - LLM models: these models generate "raw" and "chat" (conversational) completions. - Embedding models: these models generate embeddings to be used for semantic search. + - Rerank models: these models reorder the documents based on their relevance to a query. 
""" @webmethod(route="/openai/v1/chat/completions", method="GET", level=LLAMA_STACK_API_V1, deprecated=True) diff --git a/llama_stack/apis/models/models.py b/llama_stack/apis/models/models.py index 10949cb95..5486e3bfd 100644 --- a/llama_stack/apis/models/models.py +++ b/llama_stack/apis/models/models.py @@ -27,10 +27,12 @@ class ModelType(StrEnum): """Enumeration of supported model types in Llama Stack. :cvar llm: Large language model for text generation and completion :cvar embedding: Embedding model for converting text to vector representations + :cvar rerank: Reranking model for reordering documents based on their relevance to a query """ llm = "llm" embedding = "embedding" + rerank = "rerank" @json_schema_type diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index b20ad44ca..09241d836 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -44,9 +44,14 @@ from llama_stack.apis.inference import ( OpenAIEmbeddingsResponse, OpenAIMessageParam, Order, + RerankResponse, StopReason, ToolPromptFormat, ) +from llama_stack.apis.inference.inference import ( + OpenAIChatCompletionContentPartImageParam, + OpenAIChatCompletionContentPartTextParam, +) from llama_stack.apis.models import Model, ModelType from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry from llama_stack.log import get_logger @@ -182,6 +187,23 @@ class InferenceRouter(Inference): raise ModelTypeError(model_id, model.model_type, expected_model_type) return model + async def rerank( + self, + model: str, + query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam, + items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam], + max_num_results: int | None = None, + ) -> RerankResponse: + logger.debug(f"InferenceRouter.rerank: {model}") + model_obj = await self._get_model(model, ModelType.rerank) + provider = await self.routing_table.get_provider_impl(model_obj.identifier) + return await provider.rerank( + model=model_obj.identifier, + query=query, + items=items, + max_num_results=max_num_results, + ) + async def openai_completion( self, params: Annotated[OpenAICompletionRequestWithExtraBody, Body(...)], diff --git a/llama_stack/providers/utils/inference/openai_mixin.py b/llama_stack/providers/utils/inference/openai_mixin.py index a9ccc8091..bbd3d2e10 100644 --- a/llama_stack/providers/utils/inference/openai_mixin.py +++ b/llama_stack/providers/utils/inference/openai_mixin.py @@ -48,6 +48,7 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel): - overwrite_completion_id: If True, overwrites the 'id' field in OpenAI responses - download_images: If True, downloads images and converts to base64 for providers that require it - embedding_model_metadata: A dictionary mapping model IDs to their embedding metadata + - construct_model_from_identifier: Method to construct a Model instance corresponding to the given identifier - provider_data_api_key_field: Optional field name in provider data to look for API key - list_provider_model_ids: Method to list available models from the provider - get_extra_client_params: Method to provide extra parameters to the AsyncOpenAI client @@ -121,6 +122,30 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel): """ return {} + def construct_model_from_identifier(self, identifier: str) -> Model: + """ + Construct a Model instance corresponding to the given identifier + + Child classes can override 
this to customize model typing/metadata. + + :param identifier: The provider's model identifier + :return: A Model instance + """ + if metadata := self.embedding_model_metadata.get(identifier): + return Model( + provider_id=self.__provider_id__, # type: ignore[attr-defined] + provider_resource_id=identifier, + identifier=identifier, + model_type=ModelType.embedding, + metadata=metadata, + ) + return Model( + provider_id=self.__provider_id__, # type: ignore[attr-defined] + provider_resource_id=identifier, + identifier=identifier, + model_type=ModelType.llm, + ) + async def list_provider_model_ids(self) -> Iterable[str]: """ List available models from the provider. @@ -416,21 +441,7 @@ class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel): if self.allowed_models and provider_model_id not in self.allowed_models: logger.info(f"Skipping model {provider_model_id} as it is not in the allowed models list") continue - if metadata := self.embedding_model_metadata.get(provider_model_id): - model = Model( - provider_id=self.__provider_id__, # type: ignore[attr-defined] - provider_resource_id=provider_model_id, - identifier=provider_model_id, - model_type=ModelType.embedding, - metadata=metadata, - ) - else: - model = Model( - provider_id=self.__provider_id__, # type: ignore[attr-defined] - provider_resource_id=provider_model_id, - identifier=provider_model_id, - model_type=ModelType.llm, - ) + model = self.construct_model_from_identifier(provider_model_id) self._model_cache[provider_model_id] = model return list(self._model_cache.values()) diff --git a/tests/unit/providers/utils/inference/test_openai_mixin.py b/tests/unit/providers/utils/inference/test_openai_mixin.py index 61a1f8f61..d98c096aa 100644 --- a/tests/unit/providers/utils/inference/test_openai_mixin.py +++ b/tests/unit/providers/utils/inference/test_openai_mixin.py @@ -38,6 +38,28 @@ class OpenAIMixinWithEmbeddingsImpl(OpenAIMixinImpl): } +class OpenAIMixinWithCustomModelConstruction(OpenAIMixinImpl): + """Test implementation that uses construct_model_from_identifier to add rerank models""" + + embedding_model_metadata: dict[str, dict[str, int]] = { + "text-embedding-3-small": {"embedding_dimension": 1536, "context_length": 8192}, + "text-embedding-ada-002": {"embedding_dimension": 1536, "context_length": 8192}, + } + + # Adds rerank models via construct_model_from_identifier + rerank_model_ids: set[str] = {"rerank-model-1", "rerank-model-2"} + + def construct_model_from_identifier(self, identifier: str) -> Model: + if identifier in self.rerank_model_ids: + return Model( + provider_id=self.__provider_id__, # type: ignore[attr-defined] + provider_resource_id=identifier, + identifier=identifier, + model_type=ModelType.rerank, + ) + return super().construct_model_from_identifier(identifier) + + @pytest.fixture def mixin(): """Create a test instance of OpenAIMixin with mocked model_store""" @@ -62,6 +84,13 @@ def mixin_with_embeddings(): return OpenAIMixinWithEmbeddingsImpl(config=config) +@pytest.fixture +def mixin_with_custom_model_construction(): + """Create a test instance using custom construct_model_from_identifier""" + config = RemoteInferenceProviderConfig() + return OpenAIMixinWithCustomModelConstruction(config=config) + + @pytest.fixture def mock_models(): """Create multiple mock OpenAI model objects""" @@ -113,6 +142,19 @@ def mock_client_context(): return _mock_client_context +def _assert_models_match_expected(actual_models, expected_models): + """Verify the models match expected attributes. 
+ + Args: + actual_models: List of models to verify + expected_models: Mapping of model identifier to expected attribute values + """ + for identifier, expected_attrs in expected_models.items(): + model = next(m for m in actual_models if m.identifier == identifier) + for attr_name, expected_value in expected_attrs.items(): + assert getattr(model, attr_name) == expected_value + + class TestOpenAIMixinListModels: """Test cases for the list_models method""" @@ -342,21 +384,71 @@ class TestOpenAIMixinEmbeddingModelMetadata: assert result is not None assert len(result) == 2 - # Find the models in the result - embedding_model = next(m for m in result if m.identifier == "text-embedding-3-small") - llm_model = next(m for m in result if m.identifier == "gpt-4") + expected_models = { + "text-embedding-3-small": { + "model_type": ModelType.embedding, + "metadata": {"embedding_dimension": 1536, "context_length": 8192}, + "provider_id": "test-provider", + "provider_resource_id": "text-embedding-3-small", + }, + "gpt-4": { + "model_type": ModelType.llm, + "metadata": {}, + "provider_id": "test-provider", + "provider_resource_id": "gpt-4", + }, + } - # Check embedding model - assert embedding_model.model_type == ModelType.embedding - assert embedding_model.metadata == {"embedding_dimension": 1536, "context_length": 8192} - assert embedding_model.provider_id == "test-provider" - assert embedding_model.provider_resource_id == "text-embedding-3-small" + _assert_models_match_expected(result, expected_models) - # Check LLM model - assert llm_model.model_type == ModelType.llm - assert llm_model.metadata == {} # No metadata for LLMs - assert llm_model.provider_id == "test-provider" - assert llm_model.provider_resource_id == "gpt-4" + +class TestOpenAIMixinCustomModelConstruction: + """Test cases for mixed model types (LLM, embedding, rerank) through construct_model_from_identifier""" + + async def test_mixed_model_types_identification(self, mixin_with_custom_model_construction, mock_client_context): + """Test that LLM, embedding, and rerank models are correctly identified with proper types and metadata""" + # Create mock models: 1 embedding, 1 rerank, 1 LLM + mock_embedding_model = MagicMock(id="text-embedding-3-small") + mock_rerank_model = MagicMock(id="rerank-model-1") + mock_llm_model = MagicMock(id="gpt-4") + mock_models = [mock_embedding_model, mock_rerank_model, mock_llm_model] + + mock_client = MagicMock() + + async def mock_models_list(): + for model in mock_models: + yield model + + mock_client.models.list.return_value = mock_models_list() + + with mock_client_context(mixin_with_custom_model_construction, mock_client): + result = await mixin_with_custom_model_construction.list_models() + + assert result is not None + assert len(result) == 3 + + expected_models = { + "text-embedding-3-small": { + "model_type": ModelType.embedding, + "metadata": {"embedding_dimension": 1536, "context_length": 8192}, + "provider_id": "test-provider", + "provider_resource_id": "text-embedding-3-small", + }, + "rerank-model-1": { + "model_type": ModelType.rerank, + "metadata": {}, + "provider_id": "test-provider", + "provider_resource_id": "rerank-model-1", + }, + "gpt-4": { + "model_type": ModelType.llm, + "metadata": {}, + "provider_id": "test-provider", + "provider_resource_id": "gpt-4", + }, + } + + _assert_models_match_expected(result, expected_models) class TestOpenAIMixinAllowedModels: From 8885cea8d7b3412334a23298ca1a3a836bfc6421 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:32:48 -0700 Subject: [PATCH 3/5] fix(conversations)!: update Conversations API definitions (was: bump openai from 1.107.0 to 2.5.0) (#3847) Bumps [openai](https://github.com/openai/openai-python) from 1.107.0 to 2.5.0.

Changelog

Sourced from openai's changelog.

2.5.0 (2025-10-17)

Full Changelog: v2.4.0...v2.5.0

Features

Chores

  • bump httpx-aiohttp version to 0.1.9 (67f2f0a)

2.4.0 (2025-10-16)

Full Changelog: v2.3.0...v2.4.0

Features

  • api: Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint (bdbe9b8)

Chores

  • fix dangling comment (da14e99)
  • internal: detect missing future annotations with ruff (2672b8f)

2.3.0 (2025-10-10)

Full Changelog: v2.2.0...v2.3.0

Features

  • api: comparison filter in/not in (aa49f62)

Chores

  • package: bump jiter to >=0.10.0 to support Python 3.14 (#2618) (aa445ca)

2.2.0 (2025-10-06)

Full Changelog: v2.1.0...v2.2.0

Features

  • api: dev day 2025 launches (38ac009)

Bug Fixes

... (truncated)

Commits
  • 513ae76 release: 2.5.0 (#2694)
  • ebf3221 release: 2.4.0
  • e043d7b chore: fix dangling comment
  • 25cbb74 feat(api): Add support for gpt-4o-transcribe-diarize on audio/transcriptions ...
  • 8cdfd06 codegen metadata
  • d5c6443 codegen metadata
  • b20a9e7 chore(internal): detect missing future annotations with ruff
  • e5f93f5 release: 2.3.0
  • 0448788 feat(api): comparison filter in/not in
  • 85a91ad chore(package): bump jiter to >=0.10.0 to support Python 3.14 (#2618)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=openai&package-manager=uv&previous-version=1.107.0&new-version=2.5.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
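Beyond the version bump, this patch reworks the Conversations items-listing API (hence the breaking-change marker in the title): the protocol method `list` is renamed to `list_items`, and the openai `NotGiven` sentinels in its signature are replaced with plain optional parameters. A rough caller-side sketch following the new signature in the diff below (the `conversations` handle and conversation ID are illustrative placeholders):

```python
import asyncio


async def dump_items(conversations, conversation_id: str) -> None:
    # New signature: after/include/limit/order are plain `| None` keyword arguments
    # (previously NotGiven sentinels on the old `list` method).
    items = await conversations.list_items(conversation_id, order="asc", limit=20)
    for item in items.data:
        print(item.id)


# asyncio.run(dump_items(service, "conv_123"))  # both arguments are placeholders
```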
--------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: github-actions[bot] Co-authored-by: Ashwin Bharambe --- client-sdks/stainless/openapi.yml | 156 ++++-------------- docs/static/llama-stack-spec.html | 83 +++------- docs/static/llama-stack-spec.yaml | 148 +++-------------- docs/static/stainless-llama-stack-spec.html | 83 +++------- docs/static/stainless-llama-stack-spec.yaml | 148 +++-------------- .../apis/conversations/conversations.py | 28 +++- .../core/conversations/conversations.py | 19 ++- .../responses/openai_responses.py | 2 +- .../unit/conversations/test_conversations.py | 4 +- .../test_openai_responses_conversations.py | 6 +- uv.lock | 6 +- 11 files changed, 169 insertions(+), 514 deletions(-) diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml index 93049a14a..bd22f2129 100644 --- a/client-sdks/stainless/openapi.yml +++ b/client-sdks/stainless/openapi.yml @@ -350,146 +350,46 @@ paths: in: query description: >- An item ID to list items after, used in pagination. - required: true + required: false schema: - oneOf: - - type: string - - type: object - title: NotGiven - description: >- - A sentinel singleton class used to distinguish omitted keyword arguments - from those passed in with the value None (which may have different - behavior). - - For example: - - - ```py - - def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: - ... - - - - get(timeout=1) # 1s timeout - - get(timeout=None) # No timeout - - get() # Default timeout behavior, which may not be statically known - at the method definition. - - ``` + type: string - name: include in: query description: >- Specify additional output data to include in the response. - required: true + required: false schema: - oneOf: - - type: array - items: - type: string - enum: - - code_interpreter_call.outputs - - computer_call_output.output.image_url - - file_search_call.results - - message.input_image.image_url - - message.output_text.logprobs - - reasoning.encrypted_content - - type: object - title: NotGiven - description: >- - A sentinel singleton class used to distinguish omitted keyword arguments - from those passed in with the value None (which may have different - behavior). - - For example: - - - ```py - - def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: - ... - - - - get(timeout=1) # 1s timeout - - get(timeout=None) # No timeout - - get() # Default timeout behavior, which may not be statically known - at the method definition. - - ``` + type: array + items: + type: string + enum: + - web_search_call.action.sources + - code_interpreter_call.outputs + - computer_call_output.output.image_url + - file_search_call.results + - message.input_image.image_url + - message.output_text.logprobs + - reasoning.encrypted_content + title: ConversationItemInclude + description: >- + Specify additional output data to include in the model response. - name: limit in: query description: >- A limit on the number of objects to be returned (1-100, default 20). - required: true + required: false schema: - oneOf: - - type: integer - - type: object - title: NotGiven - description: >- - A sentinel singleton class used to distinguish omitted keyword arguments - from those passed in with the value None (which may have different - behavior). - - For example: - - - ```py - - def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: - ... 
- - - - get(timeout=1) # 1s timeout - - get(timeout=None) # No timeout - - get() # Default timeout behavior, which may not be statically known - at the method definition. - - ``` + type: integer - name: order in: query description: >- The order to return items in (asc or desc, default desc). - required: true + required: false schema: - oneOf: - - type: string - enum: - - asc - - desc - - type: object - title: NotGiven - description: >- - A sentinel singleton class used to distinguish omitted keyword arguments - from those passed in with the value None (which may have different - behavior). - - For example: - - - ```py - - def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: - ... - - - - get(timeout=1) # 1s timeout - - get(timeout=None) # No timeout - - get() # Default timeout behavior, which may not be statically known - at the method definition. - - ``` + type: string + enum: + - asc + - desc deprecated: false post: responses: @@ -6482,6 +6382,7 @@ components: enum: - llm - embedding + - rerank title: ModelType description: >- Enumeration of supported model types in Llama Stack. @@ -13585,13 +13486,16 @@ tags: embeddings. - This API provides the raw interface to the underlying models. Two kinds of models - are supported: + This API provides the raw interface to the underlying models. Three kinds of + models are supported: - LLM models: these models generate "raw" and "chat" (conversational) completions. - Embedding models: these models generate embeddings to be used for semantic search. + + - Rerank models: these models reorder the documents based on their relevance + to a query. x-displayName: Inference - name: Inspect description: >- diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html index 584127d91..384770954 100644 --- a/docs/static/llama-stack-spec.html +++ b/docs/static/llama-stack-spec.html @@ -483,86 +483,53 @@ "name": "after", "in": "query", "description": "An item ID to list items after, used in pagination.", - "required": true, + "required": false, "schema": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "object", - "title": "NotGiven", - "description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```" - } - ] + "type": "string" } }, { "name": "include", "in": "query", "description": "Specify additional output data to include in the response.", - "required": true, + "required": false, "schema": { - "oneOf": [ - { - "type": "array", - "items": { - "type": "string", - "enum": [ - "code_interpreter_call.outputs", - "computer_call_output.output.image_url", - "file_search_call.results", - "message.input_image.image_url", - "message.output_text.logprobs", - "reasoning.encrypted_content" - ] - } - }, - { - "type": "object", - "title": "NotGiven", - "description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method 
definition.\n```" - } - ] + "type": "array", + "items": { + "type": "string", + "enum": [ + "web_search_call.action.sources", + "code_interpreter_call.outputs", + "computer_call_output.output.image_url", + "file_search_call.results", + "message.input_image.image_url", + "message.output_text.logprobs", + "reasoning.encrypted_content" + ], + "title": "ConversationItemInclude", + "description": "Specify additional output data to include in the model response." + } } }, { "name": "limit", "in": "query", "description": "A limit on the number of objects to be returned (1-100, default 20).", - "required": true, + "required": false, "schema": { - "oneOf": [ - { - "type": "integer" - }, - { - "type": "object", - "title": "NotGiven", - "description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```" - } - ] + "type": "integer" } }, { "name": "order", "in": "query", "description": "The order to return items in (asc or desc, default desc).", - "required": true, + "required": false, "schema": { - "oneOf": [ - { - "type": "string", - "enum": [ - "asc", - "desc" - ] - }, - { - "type": "object", - "title": "NotGiven", - "description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```" - } + "type": "string", + "enum": [ + "asc", + "desc" ] } } diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml index 90b1b3a2e..36b9c7153 100644 --- a/docs/static/llama-stack-spec.yaml +++ b/docs/static/llama-stack-spec.yaml @@ -347,146 +347,46 @@ paths: in: query description: >- An item ID to list items after, used in pagination. - required: true + required: false schema: - oneOf: - - type: string - - type: object - title: NotGiven - description: >- - A sentinel singleton class used to distinguish omitted keyword arguments - from those passed in with the value None (which may have different - behavior). - - For example: - - - ```py - - def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: - ... - - - - get(timeout=1) # 1s timeout - - get(timeout=None) # No timeout - - get() # Default timeout behavior, which may not be statically known - at the method definition. - - ``` + type: string - name: include in: query description: >- Specify additional output data to include in the response. - required: true + required: false schema: - oneOf: - - type: array - items: - type: string - enum: - - code_interpreter_call.outputs - - computer_call_output.output.image_url - - file_search_call.results - - message.input_image.image_url - - message.output_text.logprobs - - reasoning.encrypted_content - - type: object - title: NotGiven - description: >- - A sentinel singleton class used to distinguish omitted keyword arguments - from those passed in with the value None (which may have different - behavior). 
- - For example: - - - ```py - - def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: - ... - - - - get(timeout=1) # 1s timeout - - get(timeout=None) # No timeout - - get() # Default timeout behavior, which may not be statically known - at the method definition. - - ``` + type: array + items: + type: string + enum: + - web_search_call.action.sources + - code_interpreter_call.outputs + - computer_call_output.output.image_url + - file_search_call.results + - message.input_image.image_url + - message.output_text.logprobs + - reasoning.encrypted_content + title: ConversationItemInclude + description: >- + Specify additional output data to include in the model response. - name: limit in: query description: >- A limit on the number of objects to be returned (1-100, default 20). - required: true + required: false schema: - oneOf: - - type: integer - - type: object - title: NotGiven - description: >- - A sentinel singleton class used to distinguish omitted keyword arguments - from those passed in with the value None (which may have different - behavior). - - For example: - - - ```py - - def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: - ... - - - - get(timeout=1) # 1s timeout - - get(timeout=None) # No timeout - - get() # Default timeout behavior, which may not be statically known - at the method definition. - - ``` + type: integer - name: order in: query description: >- The order to return items in (asc or desc, default desc). - required: true + required: false schema: - oneOf: - - type: string - enum: - - asc - - desc - - type: object - title: NotGiven - description: >- - A sentinel singleton class used to distinguish omitted keyword arguments - from those passed in with the value None (which may have different - behavior). - - For example: - - - ```py - - def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: - ... - - - - get(timeout=1) # 1s timeout - - get(timeout=None) # No timeout - - get() # Default timeout behavior, which may not be statically known - at the method definition. 
- - ``` + type: string + enum: + - asc + - desc deprecated: false post: responses: diff --git a/docs/static/stainless-llama-stack-spec.html b/docs/static/stainless-llama-stack-spec.html index f2d99a9c7..77a64ced0 100644 --- a/docs/static/stainless-llama-stack-spec.html +++ b/docs/static/stainless-llama-stack-spec.html @@ -483,86 +483,53 @@ "name": "after", "in": "query", "description": "An item ID to list items after, used in pagination.", - "required": true, + "required": false, "schema": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "object", - "title": "NotGiven", - "description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```" - } - ] + "type": "string" } }, { "name": "include", "in": "query", "description": "Specify additional output data to include in the response.", - "required": true, + "required": false, "schema": { - "oneOf": [ - { - "type": "array", - "items": { - "type": "string", - "enum": [ - "code_interpreter_call.outputs", - "computer_call_output.output.image_url", - "file_search_call.results", - "message.input_image.image_url", - "message.output_text.logprobs", - "reasoning.encrypted_content" - ] - } - }, - { - "type": "object", - "title": "NotGiven", - "description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```" - } - ] + "type": "array", + "items": { + "type": "string", + "enum": [ + "web_search_call.action.sources", + "code_interpreter_call.outputs", + "computer_call_output.output.image_url", + "file_search_call.results", + "message.input_image.image_url", + "message.output_text.logprobs", + "reasoning.encrypted_content" + ], + "title": "ConversationItemInclude", + "description": "Specify additional output data to include in the model response." 
+ } } }, { "name": "limit", "in": "query", "description": "A limit on the number of objects to be returned (1-100, default 20).", - "required": true, + "required": false, "schema": { - "oneOf": [ - { - "type": "integer" - }, - { - "type": "object", - "title": "NotGiven", - "description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```" - } - ] + "type": "integer" } }, { "name": "order", "in": "query", "description": "The order to return items in (asc or desc, default desc).", - "required": true, + "required": false, "schema": { - "oneOf": [ - { - "type": "string", - "enum": [ - "asc", - "desc" - ] - }, - { - "type": "object", - "title": "NotGiven", - "description": "A sentinel singleton class used to distinguish omitted keyword arguments from those passed in with the value None (which may have different behavior).\nFor example:\n\n```py\ndef get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ...\n\n\nget(timeout=1) # 1s timeout\nget(timeout=None) # No timeout\nget() # Default timeout behavior, which may not be statically known at the method definition.\n```" - } + "type": "string", + "enum": [ + "asc", + "desc" ] } } diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml index 9fe6cb6a3..bd22f2129 100644 --- a/docs/static/stainless-llama-stack-spec.yaml +++ b/docs/static/stainless-llama-stack-spec.yaml @@ -350,146 +350,46 @@ paths: in: query description: >- An item ID to list items after, used in pagination. - required: true + required: false schema: - oneOf: - - type: string - - type: object - title: NotGiven - description: >- - A sentinel singleton class used to distinguish omitted keyword arguments - from those passed in with the value None (which may have different - behavior). - - For example: - - - ```py - - def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: - ... - - - - get(timeout=1) # 1s timeout - - get(timeout=None) # No timeout - - get() # Default timeout behavior, which may not be statically known - at the method definition. - - ``` + type: string - name: include in: query description: >- Specify additional output data to include in the response. - required: true + required: false schema: - oneOf: - - type: array - items: - type: string - enum: - - code_interpreter_call.outputs - - computer_call_output.output.image_url - - file_search_call.results - - message.input_image.image_url - - message.output_text.logprobs - - reasoning.encrypted_content - - type: object - title: NotGiven - description: >- - A sentinel singleton class used to distinguish omitted keyword arguments - from those passed in with the value None (which may have different - behavior). - - For example: - - - ```py - - def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: - ... - - - - get(timeout=1) # 1s timeout - - get(timeout=None) # No timeout - - get() # Default timeout behavior, which may not be statically known - at the method definition. 
- - ``` + type: array + items: + type: string + enum: + - web_search_call.action.sources + - code_interpreter_call.outputs + - computer_call_output.output.image_url + - file_search_call.results + - message.input_image.image_url + - message.output_text.logprobs + - reasoning.encrypted_content + title: ConversationItemInclude + description: >- + Specify additional output data to include in the model response. - name: limit in: query description: >- A limit on the number of objects to be returned (1-100, default 20). - required: true + required: false schema: - oneOf: - - type: integer - - type: object - title: NotGiven - description: >- - A sentinel singleton class used to distinguish omitted keyword arguments - from those passed in with the value None (which may have different - behavior). - - For example: - - - ```py - - def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: - ... - - - - get(timeout=1) # 1s timeout - - get(timeout=None) # No timeout - - get() # Default timeout behavior, which may not be statically known - at the method definition. - - ``` + type: integer - name: order in: query description: >- The order to return items in (asc or desc, default desc). - required: true + required: false schema: - oneOf: - - type: string - enum: - - asc - - desc - - type: object - title: NotGiven - description: >- - A sentinel singleton class used to distinguish omitted keyword arguments - from those passed in with the value None (which may have different - behavior). - - For example: - - - ```py - - def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: - ... - - - - get(timeout=1) # 1s timeout - - get(timeout=None) # No timeout - - get() # Default timeout behavior, which may not be statically known - at the method definition. - - ``` + type: string + enum: + - asc + - desc deprecated: false post: responses: diff --git a/llama_stack/apis/conversations/conversations.py b/llama_stack/apis/conversations/conversations.py index d7752995d..3b6c50a03 100644 --- a/llama_stack/apis/conversations/conversations.py +++ b/llama_stack/apis/conversations/conversations.py @@ -4,11 +4,9 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from enum import StrEnum from typing import Annotated, Literal, Protocol, runtime_checkable -from openai import NOT_GIVEN -from openai._types import NotGiven -from openai.types.responses.response_includable import ResponseIncludable from pydantic import BaseModel, Field from llama_stack.apis.agents.openai_responses import ( @@ -150,6 +148,20 @@ class ConversationItemCreateRequest(BaseModel): ) +class ConversationItemInclude(StrEnum): + """ + Specify additional output data to include in the model response. + """ + + web_search_call_action_sources = "web_search_call.action.sources" + code_interpreter_call_outputs = "code_interpreter_call.outputs" + computer_call_output_output_image_url = "computer_call_output.output.image_url" + file_search_call_results = "file_search_call.results" + message_input_image_image_url = "message.input_image.image_url" + message_output_text_logprobs = "message.output_text.logprobs" + reasoning_encrypted_content = "reasoning.encrypted_content" + + @json_schema_type class ConversationItemList(BaseModel): """List of conversation items with pagination.""" @@ -250,13 +262,13 @@ class Conversations(Protocol): ... 
@webmethod(route="/conversations/{conversation_id}/items", method="GET", level=LLAMA_STACK_API_V1) - async def list( + async def list_items( self, conversation_id: str, - after: str | NotGiven = NOT_GIVEN, - include: list[ResponseIncludable] | NotGiven = NOT_GIVEN, - limit: int | NotGiven = NOT_GIVEN, - order: Literal["asc", "desc"] | NotGiven = NOT_GIVEN, + after: str | None = None, + include: list[ConversationItemInclude] | None = None, + limit: int | None = None, + order: Literal["asc", "desc"] | None = None, ) -> ConversationItemList: """List items. diff --git a/llama_stack/core/conversations/conversations.py b/llama_stack/core/conversations/conversations.py index 66880ca36..83a49e848 100644 --- a/llama_stack/core/conversations/conversations.py +++ b/llama_stack/core/conversations/conversations.py @@ -6,9 +6,8 @@ import secrets import time -from typing import Any +from typing import Any, Literal -from openai import NOT_GIVEN from pydantic import BaseModel, TypeAdapter from llama_stack.apis.conversations.conversations import ( @@ -16,6 +15,7 @@ from llama_stack.apis.conversations.conversations import ( ConversationDeletedResource, ConversationItem, ConversationItemDeletedResource, + ConversationItemInclude, ConversationItemList, Conversations, Metadata, @@ -247,7 +247,14 @@ class ConversationServiceImpl(Conversations): adapter: TypeAdapter[ConversationItem] = TypeAdapter(ConversationItem) return adapter.validate_python(record["item_data"]) - async def list(self, conversation_id: str, after=NOT_GIVEN, include=NOT_GIVEN, limit=NOT_GIVEN, order=NOT_GIVEN): + async def list_items( + self, + conversation_id: str, + after: str | None = None, + include: list[ConversationItemInclude] | None = None, + limit: int | None = None, + order: Literal["asc", "desc"] | None = None, + ) -> ConversationItemList: """List items in the conversation.""" if not conversation_id: raise ValueError(f"Expected a non-empty value for `conversation_id` but received {conversation_id!r}") @@ -258,14 +265,12 @@ class ConversationServiceImpl(Conversations): result = await self.sql_store.fetch_all(table="conversation_items", where={"conversation_id": conversation_id}) records = result.data - if order != NOT_GIVEN and order == "asc": + if order is not None and order == "asc": records.sort(key=lambda x: x["created_at"]) else: records.sort(key=lambda x: x["created_at"], reverse=True) - actual_limit = 20 - if limit != NOT_GIVEN and isinstance(limit, int): - actual_limit = limit + actual_limit = limit or 20 records = records[:actual_limit] items = [record["item_data"] for record in records] diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py index 2360dafd9..e2508bec1 100644 --- a/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +++ b/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py @@ -131,7 +131,7 @@ class OpenAIResponsesImpl: tool_context.recover_tools_from_previous_response(previous_response) elif conversation is not None: - conversation_items = await self.conversations_api.list(conversation, order="asc") + conversation_items = await self.conversations_api.list_items(conversation, order="asc") # Use stored messages as source of truth (like previous_response.messages) stored_messages = await self.responses_store.get_conversation_messages(conversation) diff --git a/tests/unit/conversations/test_conversations.py 
b/tests/unit/conversations/test_conversations.py index ff6dd243d..3f0175831 100644 --- a/tests/unit/conversations/test_conversations.py +++ b/tests/unit/conversations/test_conversations.py @@ -82,7 +82,7 @@ async def test_conversation_items(service): assert len(item_list.data) == 1 assert item_list.data[0].id == "msg_test123" - items = await service.list(conversation.id) + items = await service.list_items(conversation.id) assert len(items.data) == 1 @@ -120,7 +120,7 @@ async def test_openai_type_compatibility(service): assert hasattr(item_list, attr) assert item_list.object == "list" - items = await service.list(conversation.id) + items = await service.list_items(conversation.id) item = await service.retrieve(conversation.id, items.data[0].id) item_dict = item.model_dump() diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses_conversations.py b/tests/unit/providers/agents/meta_reference/test_openai_responses_conversations.py index 2ca350862..c2c113c1b 100644 --- a/tests/unit/providers/agents/meta_reference/test_openai_responses_conversations.py +++ b/tests/unit/providers/agents/meta_reference/test_openai_responses_conversations.py @@ -62,7 +62,7 @@ class TestConversationValidation: conv_id = "conv_nonexistent" # Mock conversation not found - mock_conversations_api.list.side_effect = ConversationNotFoundError("conv_nonexistent") + mock_conversations_api.list_items.side_effect = ConversationNotFoundError("conv_nonexistent") with pytest.raises(ConversationNotFoundError): await responses_impl_with_conversations.create_openai_response( @@ -160,7 +160,7 @@ class TestIntegrationWorkflow: self, responses_impl_with_conversations, mock_conversations_api ): """Test creating a response with a valid conversation parameter.""" - mock_conversations_api.list.return_value = ConversationItemList( + mock_conversations_api.list_items.return_value = ConversationItemList( data=[], first_id=None, has_more=False, last_id=None, object="list" ) @@ -227,7 +227,7 @@ class TestIntegrationWorkflow: self, responses_impl_with_conversations, mock_conversations_api ): """Test creating a response with a non-existent conversation.""" - mock_conversations_api.list.side_effect = ConversationNotFoundError("conv_nonexistent") + mock_conversations_api.list_items.side_effect = ConversationNotFoundError("conv_nonexistent") with pytest.raises(ConversationNotFoundError) as exc_info: await responses_impl_with_conversations.create_openai_response( diff --git a/uv.lock b/uv.lock index 82c6a3ae6..aad77f6a1 100644 --- a/uv.lock +++ b/uv.lock @@ -2661,7 +2661,7 @@ wheels = [ [[package]] name = "openai" -version = "1.107.0" +version = "2.5.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -2673,9 +2673,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/88/67/d6498de300f83ff57a79cb7aa96ef3bef8d6f070c3ded0f1b5b45442a6bc/openai-1.107.0.tar.gz", hash = "sha256:43e04927584e57d0e9e640ee0077c78baf8150098be96ebd5c512539b6c4e9a4", size = 566056, upload-time = "2025-09-08T19:25:47.604Z" } +sdist = { url = "https://files.pythonhosted.org/packages/72/39/aa3767c920c217ef56f27e89cbe3aaa43dd6eea3269c95f045c5761b9df1/openai-2.5.0.tar.gz", hash = "sha256:f8fa7611f96886a0f31ac6b97e58bc0ada494b255ee2cfd51c8eb502cfcb4814", size = 590333, upload-time = "2025-10-17T18:14:47.669Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/91/ed/e8a4fd20390f2858b95227c288df8fe0c835f7c77625f7583609161684ba/openai-1.107.0-py3-none-any.whl", hash = "sha256:3dcfa3cbb116bd6924b27913b8da28c4a787379ff60049588547a1013e6d6438", size = 950968, upload-time = "2025-09-08T19:25:45.552Z" }, + { url = "https://files.pythonhosted.org/packages/14/f3/ebbd700d8dc1e6380a7a382969d96bc0cbea8717b52fb38ff0ca2a7653e8/openai-2.5.0-py3-none-any.whl", hash = "sha256:21380e5f52a71666dbadbf322dd518bdf2b9d11ed0bb3f96bea17310302d6280", size = 999851, upload-time = "2025-10-17T18:14:45.528Z" }, ] [[package]] From cb2185b9361fca9a213b710f4f9e3bef5c973b52 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Wed, 22 Oct 2025 13:06:54 -0700 Subject: [PATCH 4/5] fix(logging): ensure logs go to stderr, loggers obey levels (#3885) Important fix to the logging system --- llama_stack/log.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/llama_stack/log.py b/llama_stack/log.py index dc39f6881..15e628cc3 100644 --- a/llama_stack/log.py +++ b/llama_stack/log.py @@ -137,7 +137,8 @@ class CustomRichHandler(RichHandler): # Set a reasonable default width for console output, especially when redirected to files console_width = int(os.environ.get("LLAMA_STACK_LOG_WIDTH", "120")) # Don't force terminal codes to avoid ANSI escape codes in log files - kwargs["console"] = Console(width=console_width) + # Ensure logs go to stderr, not stdout + kwargs["console"] = Console(width=console_width, stderr=True) super().__init__(*args, **kwargs) def emit(self, record): @@ -177,6 +178,7 @@ def setup_logging(category_levels: dict[str, int] | None = None, log_file: str | log_file (str | None): Path to a log file to additionally pipe the logs into. If None, reads from LLAMA_STACK_LOG_FILE environment variable. 
""" + global _category_levels # Read from environment variables if not explicitly provided if category_levels is None: category_levels = dict.fromkeys(CATEGORIES, DEFAULT_LOG_LEVEL) @@ -184,6 +186,9 @@ def setup_logging(category_levels: dict[str, int] | None = None, log_file: str | if env_config: category_levels.update(parse_environment_config(env_config)) + # Update the module-level _category_levels so that already-created loggers pick up the new levels + _category_levels.update(category_levels) + if log_file is None: log_file = os.environ.get("LLAMA_STACK_LOG_FILE") log_format = "%(asctime)s %(name)s:%(lineno)d %(category)s: %(message)s" @@ -268,14 +273,18 @@ def setup_logging(category_levels: dict[str, int] | None = None, log_file: str | } dictConfig(logging_config) - # Ensure third-party libraries follow the root log level, but preserve - # already-configured loggers (e.g., uvicorn) and our own llama_stack loggers + # Update log levels for all loggers that were created before setup_logging was called for name, logger in logging.root.manager.loggerDict.items(): if isinstance(logger, logging.Logger): - # Skip infrastructure loggers (uvicorn, fastapi) and our own loggers - if name.startswith(("uvicorn", "fastapi", "llama_stack")): + # Skip infrastructure loggers (uvicorn, fastapi) to preserve their configured levels + if name.startswith(("uvicorn", "fastapi")): continue - logger.setLevel(root_level) + # Update llama_stack loggers if root level was explicitly set (e.g., via all=CRITICAL) + if name.startswith("llama_stack") and "root" in category_levels: + logger.setLevel(root_level) + # Update third-party library loggers + elif not name.startswith("llama_stack"): + logger.setLevel(root_level) def get_logger( From 405d0e80016827b7e9a4bd40d654b919ed73f015 Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Wed, 22 Oct 2025 14:19:12 -0700 Subject: [PATCH 5/5] chore: better error messages for moderations API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? 
## Test Plan ``` ~/projects/lst3 remotes/origin/HEAD* .venv ❯ curl http://localhost:8321/v1/moderations \ -H "Content-Type: application/json" \ -d '{ "model": "gpt-4o-mini", "input": [ "hello" ] }' {"detail":"Invalid value: No shield associated with provider_resource id gpt-4o-mini: choose from ['together/meta-llama/Llama-Guard-4-12B']"} ``` --- llama_stack/core/routers/safety.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/llama_stack/core/routers/safety.py b/llama_stack/core/routers/safety.py index 9ba3327f1..d171c9721 100644 --- a/llama_stack/core/routers/safety.py +++ b/llama_stack/core/routers/safety.py @@ -65,12 +65,16 @@ class SafetyRouter(Safety): """Get Shield id from model (provider_resource_id) of shield.""" list_shields_response = await self.routing_table.list_shields() - matches = [s.identifier for s in list_shields_response.data if model == s.provider_resource_id] + matches: list[str] = [s.identifier for s in list_shields_response.data if model == s.provider_resource_id] if not matches: - raise ValueError(f"No shield associated with provider_resource id {model}") + raise ValueError( + f"No shield associated with provider_resource id {model}: choose from {[s.provider_resource_id for s in list_shields_response.data]}" + ) if len(matches) > 1: - raise ValueError(f"Multiple shields associated with provider_resource id {model}") + raise ValueError( + f"Multiple shields associated with provider_resource id {model}: matched shields {matches}" + ) return matches[0] shield_id = await get_shield_id(self, model)