From 935b8e28de29400a4b42d8b54169341c5244fec7 Mon Sep 17 00:00:00 2001 From: slekkala1 Date: Wed, 10 Sep 2025 08:48:01 -0700 Subject: [PATCH 01/30] fix: Fireworks chat completion broken due to telemetry (#3392) # What does this PR do? Fix fireworks chat completion broken due to telemetry expecting response.usage Closes https://github.com/llamastack/llama-stack/issues/3391 ## Test Plan 1. `uv run --with llama-stack llama stack build --distro starter --image-type venv --run` Try ``` curl -X POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct", "messages": [{"role": "user", "content": "Hello!"}] }' ``` ``` {"id":"chatcmpl-ee922a08-0df0-4974-b0d3-b322113e8bc0","choices":[{"message":{"role":"assistant","content":"Hello! How can I assist you today?","name":null,"tool_calls":null},"finish_reason":"stop","index":0,"logprobs":null}],"object":"chat.completion","created":1757456375,"model":"fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct"}% ``` Without fix fails as mentioned in https://github.com/llamastack/llama-stack/issues/3391 Co-authored-by: Francisco Arceo --- llama_stack/core/routers/inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 045093fe0..23972deb5 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -423,7 +423,7 @@ class InferenceRouter(Inference): # response_stream = await provider.openai_completion(**params) response = await provider.openai_completion(**params) - if self.telemetry: + if self.telemetry and getattr(response, "usage", None): metrics = self._construct_metrics( prompt_tokens=response.usage.prompt_tokens, completion_tokens=response.usage.completion_tokens, @@ -529,7 +529,7 @@ class InferenceRouter(Inference): if self.store: asyncio.create_task(self.store.store_chat_completion(response, messages)) - if self.telemetry: + if self.telemetry and getattr(response, "usage", None): metrics = self._construct_metrics( prompt_tokens=response.usage.prompt_tokens, completion_tokens=response.usage.completion_tokens, From f6bf36343df7c69c9f26ae5163cbfb6491ca7247 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 10 Sep 2025 11:52:23 -0700 Subject: [PATCH 02/30] chore: logging perf improvments (#3393) # What does this PR do? - Use BackgroundLogger when logging metric events. 
- Reuse event loop in BackgroundLogger ## Test Plan ``` cd /docs/source/distributions/k8s-benchmark # start mock server python openai-mock-server.py --port 8000 # start stack server LLAMA_STACK_LOGGING="all=WARNING" uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml # run benchmark script uv run python3 benchmark.py --duration 120 --concurrent 50 --base-url=http://localhost:8321/v1/openai/v1 --model=vllm-inference/meta-llama/Llama-3.2-3B-Instruct ``` ### RPS from 57 -> 62 --- llama_stack/core/routers/inference.py | 14 ++++---- .../providers/utils/telemetry/tracing.py | 34 +++++++++++++------ 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 23972deb5..9593dd5b9 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -63,7 +63,7 @@ from llama_stack.models.llama.llama3.chat_format import ChatFormat from llama_stack.models.llama.llama3.tokenizer import Tokenizer from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable from llama_stack.providers.utils.inference.inference_store import InferenceStore -from llama_stack.providers.utils.telemetry.tracing import get_current_span +from llama_stack.providers.utils.telemetry.tracing import enqueue_event, get_current_span logger = get_logger(name=__name__, category="core::routers") @@ -160,7 +160,7 @@ class InferenceRouter(Inference): metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model) if self.telemetry: for metric in metrics: - await self.telemetry.log_event(metric) + enqueue_event(metric) return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics] async def _count_tokens( @@ -431,7 +431,7 @@ class InferenceRouter(Inference): model=model_obj, ) for metric in metrics: - await self.telemetry.log_event(metric) + enqueue_event(metric) # these metrics will show up in the client response. response.metrics = ( @@ -537,7 +537,7 @@ class InferenceRouter(Inference): model=model_obj, ) for metric in metrics: - await self.telemetry.log_event(metric) + enqueue_event(metric) # these metrics will show up in the client response. 
response.metrics = ( metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics @@ -664,7 +664,7 @@ class InferenceRouter(Inference): "completion_tokens", "total_tokens", ]: # Only log completion and total tokens - await self.telemetry.log_event(metric) + enqueue_event(metric) # Return metrics in response async_metrics = [ @@ -710,7 +710,7 @@ class InferenceRouter(Inference): ) for metric in completion_metrics: if metric.metric in ["completion_tokens", "total_tokens"]: # Only log completion and total tokens - await self.telemetry.log_event(metric) + enqueue_event(metric) # Return metrics in response return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics] @@ -806,7 +806,7 @@ class InferenceRouter(Inference): model=model, ) for metric in metrics: - await self.telemetry.log_event(metric) + enqueue_event(metric) yield chunk finally: diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py index 7694003b5..9969b1055 100644 --- a/llama_stack/providers/utils/telemetry/tracing.py +++ b/llama_stack/providers/utils/telemetry/tracing.py @@ -18,6 +18,7 @@ from functools import wraps from typing import Any from llama_stack.apis.telemetry import ( + Event, LogSeverity, Span, SpanEndPayload, @@ -98,7 +99,7 @@ class BackgroundLogger: def __init__(self, api: Telemetry, capacity: int = 100000): self.api = api self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity) - self.worker_thread = threading.Thread(target=self._process_logs, daemon=True) + self.worker_thread = threading.Thread(target=self._worker, daemon=True) self.worker_thread.start() self._last_queue_full_log_time: float = 0.0 self._dropped_since_last_notice: int = 0 @@ -118,12 +119,16 @@ class BackgroundLogger: self._last_queue_full_log_time = current_time self._dropped_since_last_notice = 0 - def _process_logs(self): + def _worker(self): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + loop.run_until_complete(self._process_logs()) + + async def _process_logs(self): while True: try: event = self.log_queue.get() - # figure out how to use a thread's native loop - asyncio.run(self.api.log_event(event)) + await self.api.log_event(event) except Exception: import traceback @@ -136,6 +141,19 @@ class BackgroundLogger: self.log_queue.join() +def enqueue_event(event: Event) -> None: + """Enqueue a telemetry event to the background logger if available. + + This provides a non-blocking path for routers and other hot paths to + submit telemetry without awaiting the Telemetry API, reducing contention + with the main event loop. 
+ """ + global BACKGROUND_LOGGER + if BACKGROUND_LOGGER is None: + raise RuntimeError("Telemetry API not initialized") + BACKGROUND_LOGGER.log_event(event) + + class TraceContext: spans: list[Span] = [] @@ -256,11 +274,7 @@ class TelemetryHandler(logging.Handler): if record.module in ("asyncio", "selector_events"): return - global CURRENT_TRACE_CONTEXT, BACKGROUND_LOGGER - - if BACKGROUND_LOGGER is None: - raise RuntimeError("Telemetry API not initialized") - + global CURRENT_TRACE_CONTEXT context = CURRENT_TRACE_CONTEXT.get() if context is None: return @@ -269,7 +283,7 @@ class TelemetryHandler(logging.Handler): if span is None: return - BACKGROUND_LOGGER.log_event( + enqueue_event( UnstructuredLogEvent( trace_id=span.trace_id, span_id=span.span_id, From a6b1588dc612df097d4fecce317547515b281ec6 Mon Sep 17 00:00:00 2001 From: Francisco Arceo Date: Wed, 10 Sep 2025 12:53:38 -0600 Subject: [PATCH 03/30] revert: Fireworks chat completion broken due to telemetry (#3402) Reverts llamastack/llama-stack#3392 --- llama_stack/core/routers/inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 9593dd5b9..2ed2d0439 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -423,7 +423,7 @@ class InferenceRouter(Inference): # response_stream = await provider.openai_completion(**params) response = await provider.openai_completion(**params) - if self.telemetry and getattr(response, "usage", None): + if self.telemetry: metrics = self._construct_metrics( prompt_tokens=response.usage.prompt_tokens, completion_tokens=response.usage.completion_tokens, @@ -529,7 +529,7 @@ class InferenceRouter(Inference): if self.store: asyncio.create_task(self.store.store_chat_completion(response, messages)) - if self.telemetry and getattr(response, "usage", None): + if self.telemetry: metrics = self._construct_metrics( prompt_tokens=response.usage.prompt_tokens, completion_tokens=response.usage.completion_tokens, From e6edc1f93425032f35f4198a197ba31b5b11d8ee Mon Sep 17 00:00:00 2001 From: Derek Higgins Date: Wed, 10 Sep 2025 19:54:10 +0100 Subject: [PATCH 04/30] fix: unbound variable error in schedule-record-workflow.sh (#3401) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Initialize INPUTS variable to prevent 'unbound variable' error Fixes: ./scripts/github/schedule-record-workflow.sh: line 246: INPUTS: unbound variable │ --- scripts/github/schedule-record-workflow.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/github/schedule-record-workflow.sh b/scripts/github/schedule-record-workflow.sh index c292e53e6..44b0947b6 100755 --- a/scripts/github/schedule-record-workflow.sh +++ b/scripts/github/schedule-record-workflow.sh @@ -239,8 +239,9 @@ echo "Test pattern: ${TEST_PATTERN:-"(none)"}" echo "" # Prepare inputs for gh workflow run +INPUTS= if [[ -n "$TEST_SUBDIRS" ]]; then - INPUTS="-f subdirs='$TEST_SUBDIRS'" + INPUTS="$INPUTS -f subdirs='$TEST_SUBDIRS'" fi if [[ -n "$TEST_SETUP" ]]; then INPUTS="$INPUTS -f test-setup='$TEST_SETUP'" From e980436a2ed98dd725f76dfcec12235ed1d6cc82 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 10 Sep 2025 11:57:42 -0700 Subject: [PATCH 05/30] chore: introduce write queue for inference_store (#3383) # What does this PR do? Adds a write worker queue for writes to inference store. This avoids overwhelming request processing with slow inference writes. 
## Test Plan Benchmark: ``` cd /docs/source/distributions/k8s-benchmark # start mock server python openai-mock-server.py --port 8000 # start stack server LLAMA_STACK_LOGGING="all=WARNING" uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml # run benchmark script uv run python3 benchmark.py --duration 120 --concurrent 50 --base-url=http://localhost:8321/v1/openai/v1 --model=vllm-inference/meta-llama/Llama-3.2-3B-Instruct ``` ## RPS from 21 -> 57 --- .../distributions/k8s-benchmark/benchmark.py | 19 ++-- .../k8s-benchmark/stack_run_config.yaml | 9 ++ llama_stack/core/datatypes.py | 13 ++- llama_stack/core/routers/__init__.py | 5 +- llama_stack/core/routers/inference.py | 5 + .../utils/inference/inference_store.py | 98 +++++++++++++++++-- .../utils/inference/test_inference_store.py | 12 +++ 7 files changed, 139 insertions(+), 22 deletions(-) diff --git a/docs/source/distributions/k8s-benchmark/benchmark.py b/docs/source/distributions/k8s-benchmark/benchmark.py index 3d0d18150..83ba9602a 100644 --- a/docs/source/distributions/k8s-benchmark/benchmark.py +++ b/docs/source/distributions/k8s-benchmark/benchmark.py @@ -58,14 +58,6 @@ class BenchmarkStats: print(f"\n{'='*60}") print(f"BENCHMARK RESULTS") - print(f"{'='*60}") - print(f"Total time: {total_time:.2f}s") - print(f"Concurrent users: {self.concurrent_users}") - print(f"Total requests: {self.total_requests}") - print(f"Successful requests: {self.success_count}") - print(f"Failed requests: {len(self.errors)}") - print(f"Success rate: {success_rate:.1f}%") - print(f"Requests per second: {self.success_count / total_time:.2f}") print(f"\nResponse Time Statistics:") print(f" Mean: {statistics.mean(self.response_times):.3f}s") @@ -106,6 +98,15 @@ class BenchmarkStats: print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}") print(f" Total chunks received: {sum(self.chunks_received)}") + print(f"{'='*60}") + print(f"Total time: {total_time:.2f}s") + print(f"Concurrent users: {self.concurrent_users}") + print(f"Total requests: {self.total_requests}") + print(f"Successful requests: {self.success_count}") + print(f"Failed requests: {len(self.errors)}") + print(f"Success rate: {success_rate:.1f}%") + print(f"Requests per second: {self.success_count / total_time:.2f}") + if self.errors: print(f"\nErrors (showing first 5):") for error in self.errors[:5]: @@ -215,7 +216,7 @@ class LlamaStackBenchmark: await asyncio.sleep(1) # Report every second if time.time() >= last_report_time + 10: # Report every 10 seconds elapsed = time.time() - stats.start_time - print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s") + print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}") last_report_time = time.time() except asyncio.CancelledError: break diff --git a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml index f8ff7811b..5a9e2ae4f 100644 --- a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml +++ b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml @@ -2,6 +2,7 @@ version: '2' image_name: kubernetes-benchmark-demo apis: - agents +- files - inference - files - safety @@ -20,6 +21,14 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: 
${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db vector_io: - provider_id: ${env.ENABLE_CHROMADB:+chromadb} provider_type: remote::chromadb diff --git a/llama_stack/core/datatypes.py b/llama_stack/core/datatypes.py index 0f348b067..faaeefd01 100644 --- a/llama_stack/core/datatypes.py +++ b/llama_stack/core/datatypes.py @@ -431,6 +431,12 @@ class ServerConfig(BaseModel): ) +class InferenceStoreConfig(BaseModel): + sql_store_config: SqlStoreConfig + max_write_queue_size: int = Field(default=10000, description="Max queued writes for inference store") + num_writers: int = Field(default=4, description="Number of concurrent background writers") + + class StackRunConfig(BaseModel): version: int = LLAMA_STACK_RUN_CONFIG_VERSION @@ -464,11 +470,12 @@ Configuration for the persistence store used by the distribution registry. If no a default SQLite store will be used.""", ) - inference_store: SqlStoreConfig | None = Field( + inference_store: InferenceStoreConfig | SqlStoreConfig | None = Field( default=None, description=""" -Configuration for the persistence store used by the inference API. If not specified, -a default SQLite store will be used.""", +Configuration for the persistence store used by the inference API. Can be either a +InferenceStoreConfig (with queue tuning parameters) or a SqlStoreConfig (deprecated). +If not specified, a default SQLite store will be used.""", ) # registry of "resources" in the distribution diff --git a/llama_stack/core/routers/__init__.py b/llama_stack/core/routers/__init__.py index 1faace34a..f129f8ede 100644 --- a/llama_stack/core/routers/__init__.py +++ b/llama_stack/core/routers/__init__.py @@ -78,7 +78,10 @@ async def get_auto_router_impl( # TODO: move pass configs to routers instead if api == Api.inference and run_config.inference_store: - inference_store = InferenceStore(run_config.inference_store, policy) + inference_store = InferenceStore( + config=run_config.inference_store, + policy=policy, + ) await inference_store.initialize() api_to_dep_impl["store"] = inference_store diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 2ed2d0439..762d7073e 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -90,6 +90,11 @@ class InferenceRouter(Inference): async def shutdown(self) -> None: logger.debug("InferenceRouter.shutdown") + if self.store: + try: + await self.store.shutdown() + except Exception as e: + logger.warning(f"Error during InferenceStore shutdown: {e}") async def register_model( self, diff --git a/llama_stack/providers/utils/inference/inference_store.py b/llama_stack/providers/utils/inference/inference_store.py index 43006cfd5..8c69b1683 100644 --- a/llama_stack/providers/utils/inference/inference_store.py +++ b/llama_stack/providers/utils/inference/inference_store.py @@ -3,6 +3,9 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import asyncio +from typing import Any + from llama_stack.apis.inference import ( ListOpenAIChatCompletionResponse, OpenAIChatCompletion, @@ -10,24 +13,43 @@ from llama_stack.apis.inference import ( OpenAIMessageParam, Order, ) -from llama_stack.core.datatypes import AccessRule -from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR +from llama_stack.core.datatypes import AccessRule, InferenceStoreConfig +from llama_stack.log import get_logger from ..sqlstore.api import ColumnDefinition, ColumnType from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore -from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl +from ..sqlstore.sqlstore import SqlStoreConfig, SqlStoreType, sqlstore_impl + +logger = get_logger(name=__name__, category="inference_store") class InferenceStore: - def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]): - if not sql_store_config: - sql_store_config = SqliteSqlStoreConfig( - db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(), + def __init__( + self, + config: InferenceStoreConfig | SqlStoreConfig, + policy: list[AccessRule], + ): + # Handle backward compatibility + if not isinstance(config, InferenceStoreConfig): + # Legacy: SqlStoreConfig passed directly as config + config = InferenceStoreConfig( + sql_store_config=config, ) - self.sql_store_config = sql_store_config + + self.config = config + self.sql_store_config = config.sql_store_config self.sql_store = None self.policy = policy + # Disable write queue for SQLite to avoid concurrency issues + self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite + + # Async write queue and worker control + self._queue: asyncio.Queue[tuple[OpenAIChatCompletion, list[OpenAIMessageParam]]] | None = None + self._worker_tasks: list[asyncio.Task[Any]] = [] + self._max_write_queue_size: int = config.max_write_queue_size + self._num_writers: int = max(1, config.num_writers) + async def initialize(self): """Create the necessary tables if they don't exist.""" self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config)) @@ -42,10 +64,68 @@ class InferenceStore: }, ) + if self.enable_write_queue: + self._queue = asyncio.Queue(maxsize=self._max_write_queue_size) + for _ in range(self._num_writers): + self._worker_tasks.append(asyncio.create_task(self._worker_loop())) + else: + logger.info("Write queue disabled for SQLite to avoid concurrency issues") + + async def shutdown(self) -> None: + if not self._worker_tasks: + return + if self._queue is not None: + await self._queue.join() + for t in self._worker_tasks: + if not t.done(): + t.cancel() + for t in self._worker_tasks: + try: + await t + except asyncio.CancelledError: + pass + self._worker_tasks.clear() + + async def flush(self) -> None: + """Wait for all queued writes to complete. 
Useful for testing.""" + if self.enable_write_queue and self._queue is not None: + await self._queue.join() + async def store_chat_completion( self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam] ) -> None: - if not self.sql_store: + if self.enable_write_queue: + if self._queue is None: + raise ValueError("Inference store is not initialized") + try: + self._queue.put_nowait((chat_completion, input_messages)) + except asyncio.QueueFull: + logger.warning( + f"Write queue full; adding chat completion id={getattr(chat_completion, 'id', '')}" + ) + await self._queue.put((chat_completion, input_messages)) + else: + await self._write_chat_completion(chat_completion, input_messages) + + async def _worker_loop(self) -> None: + assert self._queue is not None + while True: + try: + item = await self._queue.get() + except asyncio.CancelledError: + break + chat_completion, input_messages = item + try: + await self._write_chat_completion(chat_completion, input_messages) + except Exception as e: # noqa: BLE001 + logger.error(f"Error writing chat completion: {e}") + finally: + self._queue.task_done() + + async def _write_chat_completion( + self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam] + ) -> None: + if self.sql_store is None: raise ValueError("Inference store is not initialized") data = chat_completion.model_dump() diff --git a/tests/unit/utils/inference/test_inference_store.py b/tests/unit/utils/inference/test_inference_store.py index 730f54a05..f6d63490a 100644 --- a/tests/unit/utils/inference/test_inference_store.py +++ b/tests/unit/utils/inference/test_inference_store.py @@ -65,6 +65,9 @@ async def test_inference_store_pagination_basic(): input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] await store.store_chat_completion(completion, input_messages) + # Wait for all queued writes to complete + await store.flush() + # Test 1: First page with limit=2, descending order (default) result = await store.list_chat_completions(limit=2, order=Order.desc) assert len(result.data) == 2 @@ -108,6 +111,9 @@ async def test_inference_store_pagination_ascending(): input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] await store.store_chat_completion(completion, input_messages) + # Wait for all queued writes to complete + await store.flush() + # Test ascending order pagination result = await store.list_chat_completions(limit=1, order=Order.asc) assert len(result.data) == 1 @@ -143,6 +149,9 @@ async def test_inference_store_pagination_with_model_filter(): input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] await store.store_chat_completion(completion, input_messages) + # Wait for all queued writes to complete + await store.flush() + # Test pagination with model filter result = await store.list_chat_completions(limit=1, model="model-a", order=Order.desc) assert len(result.data) == 1 @@ -190,6 +199,9 @@ async def test_inference_store_pagination_no_limit(): input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] await store.store_chat_completion(completion, input_messages) + # Wait for all queued writes to complete + await store.flush() + # Test without limit result = await store.list_chat_completions(order=Order.desc) assert len(result.data) == 2 From 7394828c7a84de2c3af0ca37546db17d6a703507 Mon Sep 17 00:00:00 2001 From: Alexey Rybak 
<50731695+reluctantfuturist@users.noreply.github.com> Date: Wed, 10 Sep 2025 12:43:36 -0700 Subject: [PATCH 06/30] docs: horizontal nav bar (#3407) # What does this PR do? * Adds a horizontal nav bar for easy access to the API reference and the Llama Stack Github repo image ## Test Plan * Built the docs and ran the local HTML server to verify changes --- docs/_static/css/my_theme.css | 101 ++++++++++++++++++++++++++++++ docs/_static/js/horizontal_nav.js | 44 +++++++++++++ docs/source/conf.py | 1 + 3 files changed, 146 insertions(+) create mode 100644 docs/_static/js/horizontal_nav.js diff --git a/docs/_static/css/my_theme.css b/docs/_static/css/my_theme.css index d078ec057..7dcd97c9b 100644 --- a/docs/_static/css/my_theme.css +++ b/docs/_static/css/my_theme.css @@ -1,5 +1,106 @@ @import url("theme.css"); +/* Horizontal Navigation Bar */ +.horizontal-nav { + background-color: #ffffff; + border-bottom: 1px solid #e5e5e5; + padding: 0; + position: fixed; + top: 0; + left: 0; + right: 0; + z-index: 1050; + height: 50px; + box-shadow: 0 2px 4px rgba(0,0,0,0.1); +} + +[data-theme="dark"] .horizontal-nav { + background-color: #1a1a1a; + border-bottom: 1px solid #333; +} + +.horizontal-nav .nav-container { + max-width: 1200px; + margin: 0 auto; + display: flex; + align-items: center; + justify-content: space-between; + padding: 0 20px; + height: 100%; +} + +.horizontal-nav .nav-brand { + font-size: 18px; + font-weight: 600; + color: #333; + text-decoration: none; +} + +[data-theme="dark"] .horizontal-nav .nav-brand { + color: #fff; +} + +.horizontal-nav .nav-links { + display: flex; + align-items: center; + gap: 30px; + list-style: none; + margin: 0; + padding: 0; +} + +.horizontal-nav .nav-links a { + color: #666; + text-decoration: none; + font-size: 14px; + font-weight: 500; + padding: 8px 12px; + border-radius: 6px; + transition: all 0.2s ease; +} + +.horizontal-nav .nav-links a:hover, +.horizontal-nav .nav-links a.active { + color: #333; + background-color: #f5f5f5; +} + +.horizontal-nav .nav-links a.active { + font-weight: 600; +} + +[data-theme="dark"] .horizontal-nav .nav-links a { + color: #ccc; +} + +[data-theme="dark"] .horizontal-nav .nav-links a:hover, +[data-theme="dark"] .horizontal-nav .nav-links a.active { + color: #fff; + background-color: #333; +} + +.horizontal-nav .nav-links .github-link { + display: flex; + align-items: center; + gap: 6px; +} + +.horizontal-nav .nav-links .github-icon { + width: 16px; + height: 16px; + fill: currentColor; +} + +/* Adjust main content to account for fixed nav */ +.wy-nav-side { + top: 50px; + height: calc(100vh - 50px); +} + +.wy-nav-content-wrap { + margin-top: 50px; +} + .wy-nav-content { max-width: 90%; } diff --git a/docs/_static/js/horizontal_nav.js b/docs/_static/js/horizontal_nav.js new file mode 100644 index 000000000..c2384f9d5 --- /dev/null +++ b/docs/_static/js/horizontal_nav.js @@ -0,0 +1,44 @@ +// Horizontal Navigation Bar for Llama Stack Documentation +document.addEventListener('DOMContentLoaded', function() { + // Create the horizontal navigation HTML + const navHTML = ` + + `; + + // Insert the navigation at the beginning of the body + document.body.insertAdjacentHTML('afterbegin', navHTML); + + // Update navigation links based on current page + updateActiveNav(); +}); + +function updateActiveNav() { + const currentPath = window.location.pathname; + const navLinks = document.querySelectorAll('.horizontal-nav .nav-links a'); + + navLinks.forEach(link => { + // Remove any existing active classes + link.classList.remove('active'); 
+ + // Add active class based on current path + if (currentPath === '/' && link.getAttribute('href') === '/') { + link.classList.add('active'); + } else if (currentPath.includes('/references/api_reference/') && link.getAttribute('href').includes('api_reference')) { + link.classList.add('active'); + } + }); +} diff --git a/docs/source/conf.py b/docs/source/conf.py index 3f84d1310..0cbddef31 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -131,6 +131,7 @@ html_static_path = ["../_static"] def setup(app): app.add_css_file("css/my_theme.css") app.add_js_file("js/detect_theme.js") + app.add_js_file("js/horizontal_nav.js") app.add_js_file("js/keyboard_shortcuts.js") def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]): From a844c4f6e189395f99a6470552876d1ba6b807f1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Sep 2025 13:17:02 -0700 Subject: [PATCH 07/30] chore(python-deps): bump pytest from 8.4.1 to 8.4.2 (#3359) Bumps [pytest](https://github.com/pytest-dev/pytest) from 8.4.1 to 8.4.2.
Release notes

Sourced from pytest's releases.

8.4.2

pytest 8.4.2 (2025-09-03)

Bug fixes

  • #13478: Fixed a crash when using `console_output_style` with `times` and a module is skipped.

  • #13530: Fixed a crash when using `pytest.approx` and `decimal.Decimal` instances with the `decimal.FloatOperation` trap set.

  • #13549: No longer evaluate type annotations in Python 3.14 when inspecting function signatures.

    This prevents crashes during module collection when modules do not explicitly use `from __future__ import annotations` and import types for annotations within an `if TYPE_CHECKING:` block.

  • #13559: Added missing `int` and `float` variants to the `Literal` type annotation of the `type` parameter in `pytest.Parser.addini`.

  • #13563: `pytest.approx` now only imports numpy if NumPy is already in `sys.modules`. This fixes unconditional import behavior introduced in 8.4.0.
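    To illustrate the #13563 change, the lazy-import guard is roughly of this shape (a simplified sketch, not pytest's actual code):

    ```python
    import sys

    def is_numpy_array(obj) -> bool:
        # Look numpy up in sys.modules instead of importing it, so the check
        # never pays the numpy import cost for users who haven't imported
        # numpy themselves -- the behavior #13563 restores.
        np = sys.modules.get("numpy")
        return np is not None and isinstance(obj, np.ndarray)
    ```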

Improved documentation

  • #13577: Clarify that pytest_generate_tests is discovered in test modules/classes; other hooks must be in conftest.py or plugins.

Contributor-facing changes

  • #13480: Self-testing: fixed a few test failures when run with `-Wdefault` or a similar override.
  • #13547: Self-testing: corrected expected message for `test_doctest_unexpected_exception` in Python 3.14.
  • #13684: Make pytest's own testsuite insensitive to the presence of the `CI` environment variable -- by @ogrisel.
Commits
  • bfae422 Prepare release version 8.4.2
  • 8990538 Fix passenv CI in tox ini and make tests insensitive to the presence of the C...
  • ca676bf Merge pull request #13687 from pytest-dev/patchback/backports/8.4.x/e63f6e51c...
  • 975a60a Merge pull request #13686 from pytest-dev/patchback/backports/8.4.x/12bde8af6...
  • 7723ce8 Merge pull request #13683 from even-even/fix_Exeption_to_Exception_in_errorMe...
  • b7f0568 Merge pull request #13685 from CoretexShadow/fix/docs-pytest-generate-tests
  • 2c94c4a add missing colon (#13640) (#13641)
  • c3d7684 Merge pull request #13606 from pytest-dev/patchback/backports/8.4.x/5f9938563...
  • dc6e3be Merge pull request #13605 from The-Compiler/training-update-2025-07
  • f87289c Fix crash with times output style and skipped module (#13573) (#13579)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=pytest&package-manager=uv&previous-version=8.4.1&new-version=8.4.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- uv.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index 2788c6fef..6f8ba7ad6 100644 --- a/uv.lock +++ b/uv.lock @@ -3540,7 +3540,7 @@ wheels = [ [[package]] name = "pytest" -version = "8.4.1" +version = "8.4.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, @@ -3549,9 +3549,9 @@ dependencies = [ { name = "pluggy" }, { name = "pygments" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/08/ba/45911d754e8eba3d5a841a5ce61a65a685ff1798421ac054f85aa8747dfb/pytest-8.4.1.tar.gz", hash = "sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c", size = 1517714, upload-time = "2025-06-18T05:48:06.109Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7", size = 365474, upload-time = "2025-06-18T05:48:03.955Z" }, + { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" }, ] [[package]] From 369083c0699270d7a3fa4d10f4975a081fcc7acd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Sep 2025 13:17:28 -0700 Subject: [PATCH 08/30] chore(python-deps): bump locust from 2.39.1 to 2.40.1 (#3358) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [locust](https://github.com/locustio/locust) from 2.39.1 to 2.40.1.
Release notes

Sourced from locust's releases.

2.40.1

What's Changed

Full Changelog: https://github.com/locustio/locust/compare/2.40.0...2.40.1

2.40.0

What's Changed

New Contributors

Full Changelog: https://github.com/locustio/locust/compare/2.39.1...2.40.0

Changelog

Sourced from locust's changelog.

Detailed changelog

The most important changes can also be found in the documentation.

Commits
  • 5df19da Merge pull request #3205 from locustio/move-pytest-plugin-to-own-directory
  • d41141b Move pytest plugin to its own directory, to prevent accidental import of locu...
  • 6422848 mention that only one locustfile can be distributed
  • aa3da73 Merge pull request #3204 from locustio/delay-imports-in-pytest-plugin-to-avoi...
  • 12050de Pytest plugin: Delay imports to avoid monkey patching until someone actually ...
  • 488d1f8 docs
  • 439b7ab docs fix
  • fcd76a8 docs: rephrase
  • 70c7e9b docs: move pytest further up
  • 06dbf98 docs: fix link
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=locust&package-manager=uv&previous-version=2.39.1&new-version=2.40.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- uv.lock | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index 6f8ba7ad6..df3a23e58 100644 --- a/uv.lock +++ b/uv.lock @@ -2023,7 +2023,7 @@ wheels = [ [[package]] name = "locust" -version = "2.39.1" +version = "2.40.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "configargparse" }, @@ -2035,6 +2035,7 @@ dependencies = [ { name = "locust-cloud" }, { name = "msgpack" }, { name = "psutil" }, + { name = "pytest" }, { name = "python-engineio" }, { name = "python-socketio", extra = ["client"] }, { name = "pywin32", marker = "sys_platform == 'win32'" }, @@ -2043,9 +2044,9 @@ dependencies = [ { name = "setuptools" }, { name = "werkzeug" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/95/c8/10aa5445c404eed389b56877e6714c1787190cc09dd70059ce3765979ec5/locust-2.39.1.tar.gz", hash = "sha256:6bdd19e27edf9a1c84391d6cf6e9a737dfb832be7dfbf39053191ae31b9cc498", size = 1409902, upload-time = "2025-08-29T17:41:01.544Z" } +sdist = { url = "https://files.pythonhosted.org/packages/01/22/82f40176473a98c9479bed667d3ad21bb859d2cb67f6880a6b0b6a725e45/locust-2.40.1.tar.gz", hash = "sha256:5bde76c1cf7e412071670f926f34844e119210c93f07a4cf9fc4cb93c60a578a", size = 1411606, upload-time = "2025-09-05T15:57:35.76Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ec/b3/b2f4b2ca88b1e72eba7be2b2982533b887f8b709d222db78eb9602aa5121/locust-2.39.1-py3-none-any.whl", hash = "sha256:fd5148f2f1a4ed34aee968abc4393674e69d1b5e1b54db50a397f6eb09ce0b04", size = 1428155, upload-time = "2025-08-29T17:41:00.245Z" }, + { url = "https://files.pythonhosted.org/packages/3b/e6/9c6335ab16becf4f8ad3da6083ab78793c56ec1ca496d6f7c74660c21c3f/locust-2.40.1-py3-none-any.whl", hash = "sha256:ef0517f9bb5ed0afa7035014faaf944802917e07da8649461aaaf5e5f3ba8a65", size = 1430154, upload-time = "2025-09-05T15:57:33.233Z" }, ] [[package]] From 438c037b1f16ee8123ab71b2aa39529ce32967a5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Sep 2025 13:17:43 -0700 Subject: [PATCH 09/30] chore(python-deps): bump openai from 1.102.0 to 1.106.1 (#3356) Bumps [openai](https://github.com/openai/openai-python) from 1.102.0 to 1.106.1.
Release notes

Sourced from openai's releases.

v1.106.1

1.106.1 (2025-09-04)

Full Changelog: v1.106.0...v1.106.1

Chores

  • internal: move mypy configurations to pyproject.toml file (ca413a2)

v1.106.0

1.106.0 (2025-09-04)

Full Changelog: v1.105.0...v1.106.0

Features

  • client: support callable api_key (#2588) (e1bad01)
  • improve future compat with pydantic v3 (6645d93)

v1.105.0

1.105.0 (2025-09-03)

Full Changelog: v1.104.2...v1.105.0

Features

  • api: Add gpt-realtime models (8502041)

v1.104.2

1.104.2 (2025-09-02)

Full Changelog: v1.104.1...v1.104.2

Bug Fixes

  • types: add aliases back for web search tool types (2521cd8)

v1.104.1

1.104.1 (2025-09-02)

Full Changelog: v1.104.0...v1.104.1

Chores

  • api: manual updates for ResponseInputAudio (0db5061)

v1.104.0

1.104.0 (2025-09-02)

Full Changelog: v1.103.0...v1.104.0

... (truncated)

Changelog

Sourced from openai's changelog.

1.106.1 (2025-09-04)

Full Changelog: v1.106.0...v1.106.1

Chores

  • internal: move mypy configurations to pyproject.toml file (ca413a2)

1.106.0 (2025-09-04)

Full Changelog: v1.105.0...v1.106.0

Features

  • client: support callable api_key (#2588) (e1bad01)
  • improve future compat with pydantic v3 (6645d93)

1.105.0 (2025-09-03)

Full Changelog: v1.104.2...v1.105.0

Features

  • api: Add gpt-realtime models (8502041)

1.104.2 (2025-09-02)

Full Changelog: v1.104.1...v1.104.2

Bug Fixes

  • types: add aliases back for web search tool types (2521cd8)

1.104.1 (2025-09-02)

Full Changelog: v1.104.0...v1.104.1

Chores

  • api: manual updates for ResponseInputAudio (0db5061)

1.104.0 (2025-09-02)

Full Changelog: v1.103.0...v1.104.0

Features

  • types: replace List[str] with SequenceNotStr in params (bc00bda)

... (truncated)

Commits
  • 2adf111 release: 1.106.1
  • c4f9d0b chore(internal): move mypy configurations to pyproject.toml file
  • 2de8d7c release: 1.106.0
  • 2cf4ed5 feat: improve future compat with pydantic v3
  • 25d16be feat(client): support callable api_key (#2588)
  • 8672413 release: 1.105.0
  • 2c60d78 feat(api): Add gpt-realtime models
  • a52463c release: 1.104.2
  • 5a6931d fix(types): add aliases back for web search tool types
  • fb152d9 release: 1.104.1
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=openai&package-manager=uv&previous-version=1.102.0&new-version=1.106.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- uv.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index df3a23e58..3d7713f54 100644 --- a/uv.lock +++ b/uv.lock @@ -2620,7 +2620,7 @@ wheels = [ [[package]] name = "openai" -version = "1.102.0" +version = "1.107.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -2632,9 +2632,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/07/55/da5598ed5c6bdd9939633854049cddc5cbac0da938dfcfcb3c6b119c16c0/openai-1.102.0.tar.gz", hash = "sha256:2e0153bcd64a6523071e90211cbfca1f2bbc5ceedd0993ba932a5869f93b7fc9", size = 519027, upload-time = "2025-08-26T20:50:29.397Z" } +sdist = { url = "https://files.pythonhosted.org/packages/88/67/d6498de300f83ff57a79cb7aa96ef3bef8d6f070c3ded0f1b5b45442a6bc/openai-1.107.0.tar.gz", hash = "sha256:43e04927584e57d0e9e640ee0077c78baf8150098be96ebd5c512539b6c4e9a4", size = 566056, upload-time = "2025-09-08T19:25:47.604Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/bd/0d/c9e7016d82c53c5b5e23e2bad36daebb8921ed44f69c0a985c6529a35106/openai-1.102.0-py3-none-any.whl", hash = "sha256:d751a7e95e222b5325306362ad02a7aa96e1fab3ed05b5888ce1c7ca63451345", size = 812015, upload-time = "2025-08-26T20:50:27.219Z" }, + { url = "https://files.pythonhosted.org/packages/91/ed/e8a4fd20390f2858b95227c288df8fe0c835f7c77625f7583609161684ba/openai-1.107.0-py3-none-any.whl", hash = "sha256:3dcfa3cbb116bd6924b27913b8da28c4a787379ff60049588547a1013e6d6438", size = 950968, upload-time = "2025-09-08T19:25:45.552Z" }, ] [[package]] From d4e45cd5f1e099d9f6ac2d52ad6cd3f74cc4facf Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Sep 2025 13:18:14 -0700 Subject: [PATCH 10/30] chore(ui-deps): bump tailwindcss from 4.1.6 to 4.1.13 in /llama_stack/ui (#3362) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [tailwindcss](https://github.com/tailwindlabs/tailwindcss/tree/HEAD/packages/tailwindcss) from 4.1.6 to 4.1.13.
Release notes

Sourced from tailwindcss's releases.

v4.1.13

Changed

  • Drop warning from browser build (#18731)
  • Drop exact duplicate declarations when emitting CSS (#18809)

Fixed

  • Don't transition visibility when using transition (#18795)
  • Discard matched variants with unknown named values (#18799)
  • Discard matched variants with non-string values (#18799)
  • Show suggestions for known matchVariant values (#18798)
  • Replace deprecated clip with clip-path in sr-only (#18769)
  • Hide internal fields from completions in matchUtilities (#18820)
  • Ignore .vercel folders by default (can be overridden by @source … rules) (#18855)
  • Consider variants starting with @- to be invalid (e.g. @-2xl:flex) (#18869)
  • Do not allow custom variants to start or end with a - or _ (#18867, #18872)
  • Upgrade: Migrate aria theme keys to @custom-variant (#18815)
  • Upgrade: Migrate data theme keys to @custom-variant (#18816)
  • Upgrade: Migrate supports theme keys to @custom-variant (#18817)

v4.1.12

Fixed

  • Don't consider the global important state in @apply (#18404)
  • Add missing suggestions for flex-<number> utilities (#18642)
  • Fix trailing ) from interfering with extraction in Clojure keywords (#18345)
  • Detect classes inside Elixir charlist, word list, and string sigils (#18432)
  • Track source locations through @plugin and @config (#18345)
  • Allow boolean values of process.env.DEBUG in @tailwindcss/node (#18485)
  • Ignore consecutive semicolons in the CSS parser (#18532)
  • Center the dropdown icon added to an input with a paired datalist by default (#18511)
  • Extract candidates in Slang templates (#18565)
  • Improve error messages when encountering invalid functional utility names (#18568)
  • Discard CSS AST objects with false or undefined properties (#18571)
  • Allow users to disable URL rebasing in @tailwindcss/postcss via transformAssetUrls: false (#18321)
  • Fix false-positive migrations in addEventListener and JavaScript variable names (#18718)
  • Fix Standalone CLI showing default Bun help when run via symlink on Windows (#18723)
  • Read from --border-color-* theme keys in divide-* utilities for backwards compatibility (#18704)
  • Don't scan .hdr and .exr files for classes by default (#18734)

v4.1.11

Fixed

  • Add heuristic to skip candidate migrations inside emit(…) (#18330)
  • Extract candidates with variants in Clojure/ClojureScript keywords (#18338)
  • Document --watch=always in the CLI's usage (#18337)
  • Add support for Vite 7 to @tailwindcss/vite (#18384)

v4.1.10

... (truncated)

Changelog

Sourced from tailwindcss's changelog.

[4.1.13] - 2025-09-03

Changed

  • Drop warning from browser build (#18731)
  • Drop exact duplicate declarations when emitting CSS (#18809)

Fixed

  • Don't transition visibility when using transition (#18795)
  • Discard matched variants with unknown named values (#18799)
  • Discard matched variants with non-string values (#18799)
  • Show suggestions for known matchVariant values (#18798)
  • Replace deprecated clip with clip-path in sr-only (#18769)
  • Hide internal fields from completions in matchUtilities (#18820)
  • Ignore .vercel folders by default (can be overridden by @source … rules) (#18855)
  • Consider variants starting with @- to be invalid (e.g. @-2xl:flex) (#18869)
  • Do not allow custom variants to start or end with a - or _ (#18867, #18872)
  • Upgrade: Migrate aria theme keys to @custom-variant (#18815)
  • Upgrade: Migrate data theme keys to @custom-variant (#18816)
  • Upgrade: Migrate supports theme keys to @custom-variant (#18817)

[4.1.12] - 2025-08-13

Fixed

  • Don't consider the global important state in @apply (#18404)
  • Add missing suggestions for flex-<number> utilities (#18642)
  • Fix trailing ) from interfering with extraction in Clojure keywords (#18345)
  • Detect classes inside Elixir charlist, word list, and string sigils (#18432)
  • Track source locations through @plugin and @config (#18345)
  • Allow boolean values of process.env.DEBUG in @tailwindcss/node (#18485)
  • Ignore consecutive semicolons in the CSS parser (#18532)
  • Center the dropdown icon added to an input with a paired datalist by default (#18511)
  • Extract candidates in Slang templates (#18565)
  • Improve error messages when encountering invalid functional utility names (#18568)
  • Discard CSS AST objects with false or undefined properties (#18571)
  • Allow users to disable URL rebasing in @tailwindcss/postcss via transformAssetUrls: false (#18321)
  • Fix false-positive migrations in addEventListener and JavaScript variable names (#18718)
  • Fix Standalone CLI showing default Bun help when run via symlink on Windows (#18723)
  • Read from --border-color-* theme keys in divide-* utilities for backwards compatibility (#18704)
  • Don't scan .hdr and .exr files for classes by default (#18734)

[4.1.11] - 2025-06-26

Fixed

  • Add heuristic to skip candidate migrations inside emit(…) (#18330)
  • Extract candidates with variants in Clojure/ClojureScript keywords (#18338)
  • Document --watch=always in the CLI's usage (#18337)

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=tailwindcss&package-manager=npm_and_yarn&previous-version=4.1.6&new-version=4.1.13)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- llama_stack/ui/package-lock.json | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/llama_stack/ui/package-lock.json b/llama_stack/ui/package-lock.json index 1db1c61cd..e2c0815fd 100644 --- a/llama_stack/ui/package-lock.json +++ b/llama_stack/ui/package-lock.json @@ -3578,6 +3578,13 @@ "tailwindcss": "4.1.6" } }, + "node_modules/@tailwindcss/node/node_modules/tailwindcss": { + "version": "4.1.6", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz", + "integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==", + "dev": true, + "license": "MIT" + }, "node_modules/@tailwindcss/oxide": { "version": "4.1.6", "resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.6.tgz", @@ -3838,6 +3845,13 @@ "tailwindcss": "4.1.6" } }, + "node_modules/@tailwindcss/postcss/node_modules/tailwindcss": { + "version": "4.1.6", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz", + "integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==", + "dev": true, + "license": "MIT" + }, "node_modules/@testing-library/dom": { "version": "10.4.1", "resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz", @@ -13843,9 +13857,9 @@ } }, "node_modules/tailwindcss": { - "version": "4.1.6", - "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz", - "integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==", + "version": "4.1.13", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.13.tgz", + "integrity": "sha512-i+zidfmTqtwquj4hMEwdjshYYgMbOrPzb9a0M3ZgNa0JMoZeFC6bxZvO8yr8ozS6ix2SDz0+mvryPeBs2TFE+w==", "dev": true, "license": "MIT" }, From d2f88a10fb0cf366708ec106696c812b8c85629c Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 10 Sep 2025 13:19:36 -0700 Subject: [PATCH 11/30] chore: telemetry test (#3405) # What does this PR do? - removed fixed-duration sleeps ## Test Plan --- .../telemetry/test_openai_telemetry.py | 17 ++++++++--------- tests/integration/telemetry/test_telemetry.py | 5 +---- .../telemetry/test_telemetry_metrics.py | 5 +---- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/tests/integration/telemetry/test_openai_telemetry.py b/tests/integration/telemetry/test_openai_telemetry.py index cdd9b6702..b3ffb6b09 100644 --- a/tests/integration/telemetry/test_openai_telemetry.py +++ b/tests/integration/telemetry/test_openai_telemetry.py @@ -49,16 +49,13 @@ def setup_openai_telemetry_data(llama_stack_client, text_model_id): traces = llama_stack_client.telemetry.query_traces(limit=10) if len(traces) >= 5: # 5 OpenAI completion traces break - time.sleep(1) + time.sleep(0.1) if len(traces) < 5: pytest.fail( f"Failed to create sufficient OpenAI completion telemetry data after 30s. Got {len(traces)} traces." 
) - # Wait for 5 seconds to ensure traces has completed logging - time.sleep(5) - yield @@ -185,11 +182,13 @@ def test_openai_completion_creates_telemetry(llama_stack_client, text_model_id): assert len(response.choices) > 0, "Response should have at least one choice" # Wait for telemetry to be recorded - time.sleep(3) - - # Check that we have more traces now - final_traces = llama_stack_client.telemetry.query_traces(limit=20) - final_count = len(final_traces) + start_time = time.time() + while time.time() - start_time < 30: + final_traces = llama_stack_client.telemetry.query_traces(limit=20) + final_count = len(final_traces) + if final_count > initial_count: + break + time.sleep(0.1) # Should have at least as many traces as before (might have more due to other activity) assert final_count >= initial_count, "Should have at least as many traces after OpenAI call" diff --git a/tests/integration/telemetry/test_telemetry.py b/tests/integration/telemetry/test_telemetry.py index d363edbc0..e86da954e 100644 --- a/tests/integration/telemetry/test_telemetry.py +++ b/tests/integration/telemetry/test_telemetry.py @@ -42,14 +42,11 @@ def setup_telemetry_data(llama_stack_client, text_model_id): traces = llama_stack_client.telemetry.query_traces(limit=10) if len(traces) >= 4: break - time.sleep(1) + time.sleep(0.1) if len(traces) < 4: pytest.fail(f"Failed to create sufficient telemetry data after 30s. Got {len(traces)} traces.") - # Wait for 5 seconds to ensure traces has completed logging - time.sleep(5) - yield diff --git a/tests/integration/telemetry/test_telemetry_metrics.py b/tests/integration/telemetry/test_telemetry_metrics.py index 4ba2bd2d9..1d8312ae2 100644 --- a/tests/integration/telemetry/test_telemetry_metrics.py +++ b/tests/integration/telemetry/test_telemetry_metrics.py @@ -46,10 +46,7 @@ def setup_telemetry_metrics_data(openai_client, client_with_models, text_model_i break except Exception: pass - time.sleep(1) - - # Wait additional time to ensure all metrics are processed - time.sleep(5) + time.sleep(0.1) # Return the token lists for use in tests return {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, "total_tokens": total_tokens} From c04f1c1e8c0b8c9df80ab51ce7379476cf218317 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 10 Sep 2025 13:19:44 -0700 Subject: [PATCH 12/30] chore: move benchmarking related code (#3406) # What does this PR do? 
- moving things and some formatting changes ## Test Plan --- .../k8s-benchmark/README.md | 4 +- .../k8s-benchmark/apply.sh | 0 .../k8s-benchmark/benchmark.py | 129 +++++++------ .../k8s-benchmark/openai-mock-server.py | 170 ++++++++++-------- .../k8s-benchmark/profile_running_server.sh | 0 .../k8s-benchmark/run-benchmark.sh | 0 .../k8s-benchmark/stack-configmap.yaml | 0 .../k8s-benchmark/stack-k8s.yaml.template | 0 .../k8s-benchmark/stack_run_config.yaml | 0 docs/source/contributing/index.md | 2 +- 10 files changed, 156 insertions(+), 149 deletions(-) rename {docs/source/distributions => benchmarking}/k8s-benchmark/README.md (98%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/apply.sh (100%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/benchmark.py (80%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/openai-mock-server.py (60%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/profile_running_server.sh (100%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/run-benchmark.sh (100%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/stack-configmap.yaml (100%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/stack-k8s.yaml.template (100%) rename {docs/source/distributions => benchmarking}/k8s-benchmark/stack_run_config.yaml (100%) diff --git a/docs/source/distributions/k8s-benchmark/README.md b/benchmarking/k8s-benchmark/README.md similarity index 98% rename from docs/source/distributions/k8s-benchmark/README.md rename to benchmarking/k8s-benchmark/README.md index 42da4d466..3b0d0c4db 100644 --- a/docs/source/distributions/k8s-benchmark/README.md +++ b/benchmarking/k8s-benchmark/README.md @@ -34,13 +34,12 @@ This data enables data-driven architectural decisions and performance optimizati **1. Deploy base k8s infrastructure:** ```bash -cd ../k8s +cd ../../docs/source/distributions/k8s ./apply.sh ``` **2. 
Deploy benchmark components:** ```bash -cd ../k8s-benchmark ./apply.sh ``` @@ -56,7 +55,6 @@ kubectl get pods **Benchmark Llama Stack (default):** ```bash -cd docs/source/distributions/k8s-benchmark/ ./run-benchmark.sh ``` diff --git a/docs/source/distributions/k8s-benchmark/apply.sh b/benchmarking/k8s-benchmark/apply.sh similarity index 100% rename from docs/source/distributions/k8s-benchmark/apply.sh rename to benchmarking/k8s-benchmark/apply.sh diff --git a/docs/source/distributions/k8s-benchmark/benchmark.py b/benchmarking/k8s-benchmark/benchmark.py similarity index 80% rename from docs/source/distributions/k8s-benchmark/benchmark.py rename to benchmarking/k8s-benchmark/benchmark.py index 83ba9602a..d5e34aa23 100644 --- a/docs/source/distributions/k8s-benchmark/benchmark.py +++ b/benchmarking/k8s-benchmark/benchmark.py @@ -14,7 +14,7 @@ import os import random import statistics import time -from typing import Tuple + import aiohttp @@ -55,50 +55,50 @@ class BenchmarkStats: total_time = self.end_time - self.start_time success_rate = (self.success_count / self.total_requests) * 100 - - print(f"\n{'='*60}") - print(f"BENCHMARK RESULTS") - - print(f"\nResponse Time Statistics:") + + print(f"\n{'=' * 60}") + print("BENCHMARK RESULTS") + + print("\nResponse Time Statistics:") print(f" Mean: {statistics.mean(self.response_times):.3f}s") print(f" Median: {statistics.median(self.response_times):.3f}s") print(f" Min: {min(self.response_times):.3f}s") print(f" Max: {max(self.response_times):.3f}s") - + if len(self.response_times) > 1: print(f" Std Dev: {statistics.stdev(self.response_times):.3f}s") - + percentiles = [50, 90, 95, 99] sorted_times = sorted(self.response_times) - print(f"\nPercentiles:") + print("\nPercentiles:") for p in percentiles: idx = int(len(sorted_times) * p / 100) - 1 idx = max(0, min(idx, len(sorted_times) - 1)) print(f" P{p}: {sorted_times[idx]:.3f}s") - + if self.ttft_times: - print(f"\nTime to First Token (TTFT) Statistics:") + print("\nTime to First Token (TTFT) Statistics:") print(f" Mean: {statistics.mean(self.ttft_times):.3f}s") print(f" Median: {statistics.median(self.ttft_times):.3f}s") print(f" Min: {min(self.ttft_times):.3f}s") print(f" Max: {max(self.ttft_times):.3f}s") - + if len(self.ttft_times) > 1: print(f" Std Dev: {statistics.stdev(self.ttft_times):.3f}s") - + sorted_ttft = sorted(self.ttft_times) - print(f"\nTTFT Percentiles:") + print("\nTTFT Percentiles:") for p in percentiles: idx = int(len(sorted_ttft) * p / 100) - 1 idx = max(0, min(idx, len(sorted_ttft) - 1)) print(f" P{p}: {sorted_ttft[idx]:.3f}s") - + if self.chunks_received: - print(f"\nStreaming Statistics:") + print("\nStreaming Statistics:") print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}") print(f" Total chunks received: {sum(self.chunks_received)}") - - print(f"{'='*60}") + + print(f"{'=' * 60}") print(f"Total time: {total_time:.2f}s") print(f"Concurrent users: {self.concurrent_users}") print(f"Total requests: {self.total_requests}") @@ -106,16 +106,16 @@ class BenchmarkStats: print(f"Failed requests: {len(self.errors)}") print(f"Success rate: {success_rate:.1f}%") print(f"Requests per second: {self.success_count / total_time:.2f}") - + if self.errors: - print(f"\nErrors (showing first 5):") + print("\nErrors (showing first 5):") for error in self.errors[:5]: print(f" {error}") class LlamaStackBenchmark: def __init__(self, base_url: str, model_id: str): - self.base_url = base_url.rstrip('/') + self.base_url = base_url.rstrip("/") self.model_id = model_id 
self.headers = {"Content-Type": "application/json"} self.test_messages = [ @@ -126,74 +126,67 @@ class LlamaStackBenchmark: [ {"role": "user", "content": "What is machine learning?"}, {"role": "assistant", "content": "Machine learning is a subset of AI..."}, - {"role": "user", "content": "Can you give me a practical example?"} - ] + {"role": "user", "content": "Can you give me a practical example?"}, + ], ] - - async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]: + async def make_async_streaming_request(self) -> tuple[float, int, float | None, str | None]: """Make a single async streaming chat completion request.""" messages = random.choice(self.test_messages) - payload = { - "model": self.model_id, - "messages": messages, - "stream": True, - "max_tokens": 100 - } - + payload = {"model": self.model_id, "messages": messages, "stream": True, "max_tokens": 100} + start_time = time.time() chunks_received = 0 ttft = None error = None - + session = aiohttp.ClientSession() - + try: async with session.post( f"{self.base_url}/chat/completions", headers=self.headers, json=payload, - timeout=aiohttp.ClientTimeout(total=30) + timeout=aiohttp.ClientTimeout(total=30), ) as response: if response.status == 200: async for line in response.content: if line: - line_str = line.decode('utf-8').strip() - if line_str.startswith('data: '): + line_str = line.decode("utf-8").strip() + if line_str.startswith("data: "): chunks_received += 1 if ttft is None: ttft = time.time() - start_time - if line_str == 'data: [DONE]': + if line_str == "data: [DONE]": break - + if chunks_received == 0: error = "No streaming chunks received" else: text = await response.text() error = f"HTTP {response.status}: {text[:100]}" - + except Exception as e: error = f"Request error: {str(e)}" finally: await session.close() - + response_time = time.time() - start_time return response_time, chunks_received, ttft, error - async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats: """Run benchmark using async requests for specified duration.""" stats = BenchmarkStats() stats.concurrent_users = concurrent_users stats.start_time = time.time() - + print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users") print(f"Target URL: {self.base_url}/chat/completions") print(f"Model: {self.model_id}") - + connector = aiohttp.TCPConnector(limit=concurrent_users) - async with aiohttp.ClientSession(connector=connector) as session: - + async with aiohttp.ClientSession(connector=connector): + async def worker(worker_id: int): """Worker that sends requests sequentially until canceled.""" request_count = 0 @@ -202,12 +195,12 @@ class LlamaStackBenchmark: response_time, chunks, ttft, error = await self.make_async_streaming_request() await stats.add_result(response_time, chunks, ttft, error) request_count += 1 - + except asyncio.CancelledError: break except Exception as e: await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}") - + # Progress reporting task async def progress_reporter(): last_report_time = time.time() @@ -216,48 +209,52 @@ class LlamaStackBenchmark: await asyncio.sleep(1) # Report every second if time.time() >= last_report_time + 10: # Report every 10 seconds elapsed = time.time() - stats.start_time - print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}") + print( + f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}" + ) 
last_report_time = time.time() except asyncio.CancelledError: break - + # Spawn concurrent workers tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)] progress_task = asyncio.create_task(progress_reporter()) tasks.append(progress_task) - + # Wait for duration then cancel all tasks await asyncio.sleep(duration) - + for task in tasks: task.cancel() - + # Wait for all tasks to complete await asyncio.gather(*tasks, return_exceptions=True) - + stats.end_time = time.time() return stats def main(): parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool") - parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"), - help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)") - parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"), - help="Model ID to use for requests") - parser.add_argument("--duration", type=int, default=60, - help="Duration in seconds to run benchmark (default: 60)") - parser.add_argument("--concurrent", type=int, default=10, - help="Number of concurrent users (default: 10)") - + parser.add_argument( + "--base-url", + default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"), + help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)", + ) + parser.add_argument( + "--model", default=os.getenv("INFERENCE_MODEL", "test-model"), help="Model ID to use for requests" + ) + parser.add_argument("--duration", type=int, default=60, help="Duration in seconds to run benchmark (default: 60)") + parser.add_argument("--concurrent", type=int, default=10, help="Number of concurrent users (default: 10)") + args = parser.parse_args() - + benchmark = LlamaStackBenchmark(args.base_url, args.model) - + try: stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent)) stats.print_summary() - + except KeyboardInterrupt: print("\nBenchmark interrupted by user") except Exception as e: diff --git a/docs/source/distributions/k8s-benchmark/openai-mock-server.py b/benchmarking/k8s-benchmark/openai-mock-server.py similarity index 60% rename from docs/source/distributions/k8s-benchmark/openai-mock-server.py rename to benchmarking/k8s-benchmark/openai-mock-server.py index de0680842..9e898af8e 100755 --- a/docs/source/distributions/k8s-benchmark/openai-mock-server.py +++ b/benchmarking/k8s-benchmark/openai-mock-server.py @@ -11,180 +11,192 @@ OpenAI-compatible mock server that returns: - Valid OpenAI-formatted chat completion responses with dynamic content """ -from flask import Flask, request, jsonify, Response -import time -import random -import uuid -import json import argparse +import json import os +import random +import time +import uuid + +from flask import Flask, Response, jsonify, request app = Flask(__name__) + # Models from environment variables def get_models(): models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct") model_ids = [m.strip() for m in models_str.split(",") if m.strip()] - + return { "object": "list", "data": [ - { - "id": model_id, - "object": "model", - "created": 1234567890, - "owned_by": "vllm" - } - for model_id in model_ids - ] + {"id": model_id, "object": "model", "created": 1234567890, "owned_by": "vllm"} for model_id in model_ids + ], } + def generate_random_text(length=50): """Generate random but coherent text for responses.""" words = [ - "Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you", - "with", "your", "questions", "and", "tasks", "today", 
"Let", "me","know", "what", - "you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist", - "with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more" + "Hello", + "there", + "I'm", + "an", + "AI", + "assistant", + "ready", + "to", + "help", + "you", + "with", + "your", + "questions", + "and", + "tasks", + "today", + "Let", + "me", + "know", + "what", + "you'd", + "like", + "to", + "discuss", + "or", + "explore", + "together", + "I", + "can", + "assist", + "with", + "various", + "topics", + "including", + "coding", + "writing", + "analysis", + "and", + "more", ] return " ".join(random.choices(words, k=length)) -@app.route('/v1/models', methods=['GET']) + +@app.route("/v1/models", methods=["GET"]) def list_models(): models = get_models() print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}") return jsonify(models) -@app.route('/v1/chat/completions', methods=['POST']) + +@app.route("/v1/chat/completions", methods=["POST"]) def chat_completions(): """Return OpenAI-formatted chat completion responses.""" data = request.get_json() - default_model = get_models()['data'][0]['id'] - model = data.get('model', default_model) - messages = data.get('messages', []) - stream = data.get('stream', False) - + default_model = get_models()["data"][0]["id"] + model = data.get("model", default_model) + messages = data.get("messages", []) + stream = data.get("stream", False) + print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}") - + if stream: return handle_streaming_completion(model, messages) else: return handle_non_streaming_completion(model, messages) + def handle_non_streaming_completion(model, messages): response_text = generate_random_text(random.randint(20, 80)) - + # Calculate realistic token counts - prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages) + prompt_tokens = sum(len(str(msg.get("content", "")).split()) for msg in messages) completion_tokens = len(response_text.split()) - + response = { "id": f"chatcmpl-{uuid.uuid4().hex[:8]}", "object": "chat.completion", "created": int(time.time()), "model": model, - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": response_text - }, - "finish_reason": "stop" - } - ], + "choices": [{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}], "usage": { "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens - } + "total_tokens": prompt_tokens + completion_tokens, + }, } - + return jsonify(response) + def handle_streaming_completion(model, messages): def generate_stream(): # Generate response text full_response = generate_random_text(random.randint(30, 100)) words = full_response.split() - + # Send initial chunk initial_chunk = { "id": f"chatcmpl-{uuid.uuid4().hex[:8]}", "object": "chat.completion.chunk", "created": int(time.time()), "model": model, - "choices": [ - { - "index": 0, - "delta": {"role": "assistant", "content": ""} - } - ] + "choices": [{"index": 0, "delta": {"role": "assistant", "content": ""}}], } yield f"data: {json.dumps(initial_chunk)}\n\n" - + # Send word by word for i, word in enumerate(words): chunk = { "id": f"chatcmpl-{uuid.uuid4().hex[:8]}", - "object": "chat.completion.chunk", + "object": "chat.completion.chunk", "created": int(time.time()), "model": model, - "choices": [ - { - "index": 0, - "delta": {"content": f"{word} " if i < len(words) - 1 else word} - } - ] + 
"choices": [{"index": 0, "delta": {"content": f"{word} " if i < len(words) - 1 else word}}], } yield f"data: {json.dumps(chunk)}\n\n" # Configurable delay to simulate realistic streaming stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005")) time.sleep(stream_delay) - + # Send final chunk final_chunk = { "id": f"chatcmpl-{uuid.uuid4().hex[:8]}", "object": "chat.completion.chunk", "created": int(time.time()), "model": model, - "choices": [ - { - "index": 0, - "delta": {"content": ""}, - "finish_reason": "stop" - } - ] + "choices": [{"index": 0, "delta": {"content": ""}, "finish_reason": "stop"}], } yield f"data: {json.dumps(final_chunk)}\n\n" yield "data: [DONE]\n\n" - + return Response( generate_stream(), - mimetype='text/event-stream', + mimetype="text/event-stream", headers={ - 'Cache-Control': 'no-cache', - 'Connection': 'keep-alive', - 'Access-Control-Allow-Origin': '*', - } + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "Access-Control-Allow-Origin": "*", + }, ) -@app.route('/health', methods=['GET']) + +@app.route("/health", methods=["GET"]) def health(): return jsonify({"status": "healthy", "type": "openai-mock"}) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='OpenAI-compatible mock server') - parser.add_argument('--port', type=int, default=8081, - help='Port to run the server on (default: 8081)') + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="OpenAI-compatible mock server") + parser.add_argument("--port", type=int, default=8081, help="Port to run the server on (default: 8081)") args = parser.parse_args() - + port = args.port - + models = get_models() print("Starting OpenAI-compatible mock server...") print(f"- /models endpoint with: {[m['id'] for m in models['data']]}") print("- OpenAI-formatted chat/completion responses with dynamic content") print("- Streaming support with valid SSE format") print(f"- Listening on: http://0.0.0.0:{port}") - app.run(host='0.0.0.0', port=port, debug=False) + app.run(host="0.0.0.0", port=port, debug=False) diff --git a/docs/source/distributions/k8s-benchmark/profile_running_server.sh b/benchmarking/k8s-benchmark/profile_running_server.sh similarity index 100% rename from docs/source/distributions/k8s-benchmark/profile_running_server.sh rename to benchmarking/k8s-benchmark/profile_running_server.sh diff --git a/docs/source/distributions/k8s-benchmark/run-benchmark.sh b/benchmarking/k8s-benchmark/run-benchmark.sh similarity index 100% rename from docs/source/distributions/k8s-benchmark/run-benchmark.sh rename to benchmarking/k8s-benchmark/run-benchmark.sh diff --git a/docs/source/distributions/k8s-benchmark/stack-configmap.yaml b/benchmarking/k8s-benchmark/stack-configmap.yaml similarity index 100% rename from docs/source/distributions/k8s-benchmark/stack-configmap.yaml rename to benchmarking/k8s-benchmark/stack-configmap.yaml diff --git a/docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template b/benchmarking/k8s-benchmark/stack-k8s.yaml.template similarity index 100% rename from docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template rename to benchmarking/k8s-benchmark/stack-k8s.yaml.template diff --git a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml b/benchmarking/k8s-benchmark/stack_run_config.yaml similarity index 100% rename from docs/source/distributions/k8s-benchmark/stack_run_config.yaml rename to benchmarking/k8s-benchmark/stack_run_config.yaml diff --git a/docs/source/contributing/index.md b/docs/source/contributing/index.md 
index 1846f4d97..71c3bd5a6 100644 --- a/docs/source/contributing/index.md +++ b/docs/source/contributing/index.md @@ -35,5 +35,5 @@ testing/record-replay ### Benchmarking -```{include} ../../../docs/source/distributions/k8s-benchmark/README.md +```{include} ../../../benchmarking/k8s-benchmark/README.md ``` From 0c7f49490cdb6ff757659469d1401b515ac4402c Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Wed, 10 Sep 2025 14:34:18 -0700 Subject: [PATCH 13/30] fix(inference_store): on duplicate chat completion IDs, replace (#3408) # What does this PR do? Duplicate chat completion IDs can be generated during tests especially if they are replaying recorded responses across different tests. No need to warn or error under those circumstances. In the wild, this is not likely to happen at all (no evidence) so we aren't really hiding any problem. --- .../utils/inference/inference_store.py | 48 +++++++++++++++---- .../utils/sqlstore/authorized_sqlstore.py | 14 ++++++ 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/llama_stack/providers/utils/inference/inference_store.py b/llama_stack/providers/utils/inference/inference_store.py index 8c69b1683..17f4c6268 100644 --- a/llama_stack/providers/utils/inference/inference_store.py +++ b/llama_stack/providers/utils/inference/inference_store.py @@ -6,6 +6,8 @@ import asyncio from typing import Any +from sqlalchemy.exc import IntegrityError + from llama_stack.apis.inference import ( ListOpenAIChatCompletionResponse, OpenAIChatCompletion, @@ -129,16 +131,44 @@ class InferenceStore: raise ValueError("Inference store is not initialized") data = chat_completion.model_dump() + record_data = { + "id": data["id"], + "created": data["created"], + "model": data["model"], + "choices": data["choices"], + "input_messages": [message.model_dump() for message in input_messages], + } - await self.sql_store.insert( - table="chat_completions", - data={ - "id": data["id"], - "created": data["created"], - "model": data["model"], - "choices": data["choices"], - "input_messages": [message.model_dump() for message in input_messages], - }, + try: + await self.sql_store.insert( + table="chat_completions", + data=record_data, + ) + except IntegrityError as e: + # Duplicate chat completion IDs can be generated during tests especially if they are replaying + # recorded responses across different tests. No need to warn or error under those circumstances. + # In the wild, this is not likely to happen at all (no evidence) so we aren't really hiding any problem. 
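+            # In effect the insert becomes an upsert: on a duplicate chat completion ID
+            # we fall through to the update below so the newest record wins.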
+ + # Check if it's a unique constraint violation + error_message = str(e.orig) if e.orig else str(e) + if self._is_unique_constraint_error(error_message): + # Update the existing record instead + await self.sql_store.update(table="chat_completions", data=record_data, where={"id": data["id"]}) + else: + # Re-raise if it's not a unique constraint error + raise + + def _is_unique_constraint_error(self, error_message: str) -> bool: + """Check if the error is specifically a unique constraint violation.""" + error_lower = error_message.lower() + return any( + indicator in error_lower + for indicator in [ + "unique constraint failed", # SQLite + "duplicate key", # PostgreSQL + "unique violation", # PostgreSQL alternative + "duplicate entry", # MySQL + ] ) async def list_chat_completions( diff --git a/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py b/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py index 867ba2f55..acb688f96 100644 --- a/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py +++ b/llama_stack/providers/utils/sqlstore/authorized_sqlstore.py @@ -172,6 +172,20 @@ class AuthorizedSqlStore: return results.data[0] if results.data else None + async def update(self, table: str, data: Mapping[str, Any], where: Mapping[str, Any]) -> None: + """Update rows with automatic access control attribute capture.""" + enhanced_data = dict(data) + + current_user = get_authenticated_user() + if current_user: + enhanced_data["owner_principal"] = current_user.principal + enhanced_data["access_attributes"] = current_user.attributes + else: + enhanced_data["owner_principal"] = None + enhanced_data["access_attributes"] = None + + await self.sql_store.update(table, enhanced_data, where) + async def delete(self, table: str, where: Mapping[str, Any]) -> None: """Delete rows with automatic access control filtering.""" await self.sql_store.delete(table, where) From 8e05c68d159a40d54768a9473d63b68a5bfbf369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 11 Sep 2025 10:19:59 +0200 Subject: [PATCH 14/30] chore: remove openai dependency from providers (#3398) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? The openai package is already a dependency of the llama-stack project itself, so let the project dictate which openai version we need and avoid potential breakage with unsatisfiable dependency resolution.
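For reference, the only openai pin that remains is the project-level one, shown abridged below as it appears in this patch's pyproject.toml hunk. Providers now inherit whatever version the project resolves, so a provider can no longer pin a version the project cannot satisfy.

```
# pyproject.toml (abridged) — the single remaining openai pin after this PR
dependencies = [
    # ... other project dependencies ...
    "openai>=1.100.0",  # for expires_after support
]
```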
Signed-off-by: Sébastien Han --- llama_stack/providers/registry/batches.py | 2 +- llama_stack/providers/registry/inference.py | 20 ++++++++------------ llama_stack/providers/registry/scoring.py | 2 +- pyproject.toml | 4 +--- uv.lock | 8 ++------ 5 files changed, 13 insertions(+), 23 deletions(-) diff --git a/llama_stack/providers/registry/batches.py b/llama_stack/providers/registry/batches.py index de7886efb..a07942486 100644 --- a/llama_stack/providers/registry/batches.py +++ b/llama_stack/providers/registry/batches.py @@ -13,7 +13,7 @@ def available_providers() -> list[ProviderSpec]: InlineProviderSpec( api=Api.batches, provider_type="inline::reference", - pip_packages=["openai"], + pip_packages=[], module="llama_stack.providers.inline.batches.reference", config_class="llama_stack.providers.inline.batches.reference.config.ReferenceBatchesImplConfig", api_dependencies=[ diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index 541fbb432..8912560cb 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -75,7 +75,7 @@ def available_providers() -> list[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="vllm", - pip_packages=["openai"], + pip_packages=[], module="llama_stack.providers.remote.inference.vllm", config_class="llama_stack.providers.remote.inference.vllm.VLLMInferenceAdapterConfig", description="Remote vLLM inference provider for connecting to vLLM servers.", @@ -151,9 +151,7 @@ def available_providers() -> list[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="databricks", - pip_packages=[ - "openai", - ], + pip_packages=[], module="llama_stack.providers.remote.inference.databricks", config_class="llama_stack.providers.remote.inference.databricks.DatabricksImplConfig", description="Databricks inference provider for running models on Databricks' unified analytics platform.", @@ -163,9 +161,7 @@ def available_providers() -> list[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="nvidia", - pip_packages=[ - "openai", - ], + pip_packages=[], module="llama_stack.providers.remote.inference.nvidia", config_class="llama_stack.providers.remote.inference.nvidia.NVIDIAConfig", description="NVIDIA inference provider for accessing NVIDIA NIM models and AI services.", @@ -175,7 +171,7 @@ def available_providers() -> list[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="runpod", - pip_packages=["openai"], + pip_packages=[], module="llama_stack.providers.remote.inference.runpod", config_class="llama_stack.providers.remote.inference.runpod.RunpodImplConfig", description="RunPod inference provider for running models on RunPod's cloud GPU platform.", @@ -207,7 +203,7 @@ def available_providers() -> list[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="gemini", - pip_packages=["litellm", "openai"], + pip_packages=["litellm"], module="llama_stack.providers.remote.inference.gemini", config_class="llama_stack.providers.remote.inference.gemini.GeminiConfig", provider_data_validator="llama_stack.providers.remote.inference.gemini.config.GeminiProviderDataValidator", @@ -218,7 +214,7 @@ def available_providers() -> list[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="vertexai", - pip_packages=["litellm", "google-cloud-aiplatform", "openai"], + pip_packages=["litellm", "google-cloud-aiplatform"], module="llama_stack.providers.remote.inference.vertexai", 
config_class="llama_stack.providers.remote.inference.vertexai.VertexAIConfig", provider_data_validator="llama_stack.providers.remote.inference.vertexai.config.VertexAIProviderDataValidator", @@ -248,7 +244,7 @@ Available Models: api=Api.inference, adapter=AdapterSpec( adapter_type="groq", - pip_packages=["litellm", "openai"], + pip_packages=["litellm"], module="llama_stack.providers.remote.inference.groq", config_class="llama_stack.providers.remote.inference.groq.GroqConfig", provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator", @@ -270,7 +266,7 @@ Available Models: api=Api.inference, adapter=AdapterSpec( adapter_type="sambanova", - pip_packages=["litellm", "openai"], + pip_packages=["litellm"], module="llama_stack.providers.remote.inference.sambanova", config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig", provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator", diff --git a/llama_stack/providers/registry/scoring.py b/llama_stack/providers/registry/scoring.py index 79293d888..a4ec54ed2 100644 --- a/llama_stack/providers/registry/scoring.py +++ b/llama_stack/providers/registry/scoring.py @@ -38,7 +38,7 @@ def available_providers() -> list[ProviderSpec]: InlineProviderSpec( api=Api.scoring, provider_type="inline::braintrust", - pip_packages=["autoevals", "openai"], + pip_packages=["autoevals"], module="llama_stack.providers.inline.scoring.braintrust", config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig", api_dependencies=[ diff --git a/pyproject.toml b/pyproject.toml index 0414aafb0..72c4f6f9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "jinja2>=3.1.6", "jsonschema", "llama-stack-client>=0.2.21", - "openai>=1.99.6", + "openai>=1.100.0", # for expires_after support "prompt-toolkit", "python-dotenv", "python-jose[cryptography]", @@ -80,7 +80,6 @@ dev = [ unit = [ "sqlite-vec", "ollama", - "openai", "aiosqlite", "aiohttp", "psycopg2-binary>=2.9.0", @@ -105,7 +104,6 @@ unit = [ # separately. If you are using "uv" to execute your tests, you can use the "--group" flag to specify extra # dependencies. 
test = [ - "openai>=1.100.0", # for expires_after support "aiosqlite", "aiohttp", "torch>=2.6.0", diff --git a/uv.lock b/uv.lock index 3d7713f54..065eb3876 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.12" resolution-markers = [ "(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')", @@ -1839,7 +1839,6 @@ test = [ { name = "datasets" }, { name = "mcp" }, { name = "milvus-lite" }, - { name = "openai" }, { name = "psycopg2-binary" }, { name = "pymilvus" }, { name = "pypdf" }, @@ -1865,7 +1864,6 @@ unit = [ { name = "milvus-lite" }, { name = "moto", extra = ["s3"] }, { name = "ollama" }, - { name = "openai" }, { name = "psycopg2-binary" }, { name = "pymilvus" }, { name = "pypdf" }, @@ -1889,7 +1887,7 @@ requires-dist = [ { name = "jsonschema" }, { name = "llama-stack-client", specifier = ">=0.2.21" }, { name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.21" }, - { name = "openai", specifier = ">=1.99.6" }, + { name = "openai", specifier = ">=1.100.0" }, { name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.30.0" }, { name = "opentelemetry-sdk", specifier = ">=1.30.0" }, { name = "pandas", marker = "extra == 'ui'" }, @@ -1959,7 +1957,6 @@ test = [ { name = "datasets", specifier = ">=4.0.0" }, { name = "mcp" }, { name = "milvus-lite", specifier = ">=2.5.0" }, - { name = "openai", specifier = ">=1.100.0" }, { name = "psycopg2-binary", specifier = ">=2.9.0" }, { name = "pymilvus", specifier = ">=2.6.1" }, { name = "pypdf" }, @@ -1984,7 +1981,6 @@ unit = [ { name = "milvus-lite", specifier = ">=2.5.0" }, { name = "moto", extras = ["s3"], specifier = ">=5.1.10" }, { name = "ollama" }, - { name = "openai" }, { name = "psycopg2-binary", specifier = ">=2.9.0" }, { name = "pymilvus", specifier = ">=2.6.1" }, { name = "pypdf" }, From 2838d5a20f888c9f8fad666272dd9ca8d3bb4884 Mon Sep 17 00:00:00 2001 From: Sumanth Kamenani Date: Thu, 11 Sep 2025 05:41:53 -0400 Subject: [PATCH 15/30] fix: AWS Bedrock inference profile ID conversion for region-specific endpoints (#3386) Fixes #3370 AWS switched to requiring region-prefixed inference profile IDs instead of foundation model IDs for on-demand throughput. This was causing ValidationException errors. Added auto-detection based on boto3 client region to convert model IDs like meta.llama3-1-70b-instruct-v1:0 to us.meta.llama3-1-70b-instruct-v1:0 depending on the detected region. Also handles edge cases like ARNs, case insensitive regions, and None regions. Tested with this request. ```json { "model_id": "meta.llama3-1-8b-instruct-v1:0", "messages": [ { "role": "system", "content": "You are a helpful assistant." 
}, { "role": "user", "content": "tell me a riddle" } ], "sampling_params": { "strategy": { "type": "top_p", "temperature": 0.7, "top_p": 0.9 }, "max_tokens": 512 } } ``` --- .../remote/inference/bedrock/bedrock.py | 51 +++++++++++++++++- tests/unit/providers/test_bedrock.py | 53 +++++++++++++++++++ 2 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 tests/unit/providers/test_bedrock.py diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py index 63ea196f6..106caed9b 100644 --- a/llama_stack/providers/remote/inference/bedrock/bedrock.py +++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py @@ -53,6 +53,43 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( from .models import MODEL_ENTRIES +REGION_PREFIX_MAP = { + "us": "us.", + "eu": "eu.", + "ap": "ap.", +} + + +def _get_region_prefix(region: str | None) -> str: + # AWS requires region prefixes for inference profiles + if region is None: + return "us." # default to US when we don't know + + # Handle case insensitive region matching + region_lower = region.lower() + for prefix in REGION_PREFIX_MAP: + if region_lower.startswith(f"{prefix}-"): + return REGION_PREFIX_MAP[prefix] + + # Fallback to US for anything we don't recognize + return "us." + + +def _to_inference_profile_id(model_id: str, region: str = None) -> str: + # Return ARNs unchanged + if model_id.startswith("arn:"): + return model_id + + # Return inference profile IDs that already have regional prefixes + if any(model_id.startswith(p) for p in REGION_PREFIX_MAP.values()): + return model_id + + # Default to US East when no region is provided + if region is None: + region = "us-east-1" + + return _get_region_prefix(region) + model_id + class BedrockInferenceAdapter( ModelRegistryHelper, @@ -166,8 +203,13 @@ class BedrockInferenceAdapter( options["repetition_penalty"] = sampling_params.repetition_penalty prompt = await chat_completion_request_to_prompt(request, self.get_llama_model(request.model)) + + # Convert foundation model ID to inference profile ID + region_name = self.client.meta.region_name + inference_profile_id = _to_inference_profile_id(bedrock_model, region_name) + return { - "modelId": bedrock_model, + "modelId": inference_profile_id, "body": json.dumps( { "prompt": prompt, @@ -185,6 +227,11 @@ class BedrockInferenceAdapter( task_type: EmbeddingTaskType | None = None, ) -> EmbeddingsResponse: model = await self.model_store.get_model(model_id) + + # Convert foundation model ID to inference profile ID + region_name = self.client.meta.region_name + inference_profile_id = _to_inference_profile_id(model.provider_resource_id, region_name) + embeddings = [] for content in contents: assert not content_has_media(content), "Bedrock does not support media for embeddings" @@ -193,7 +240,7 @@ class BedrockInferenceAdapter( body = json.dumps(input_body) response = self.client.invoke_model( body=body, - modelId=model.provider_resource_id, + modelId=inference_profile_id, accept="application/json", contentType="application/json", ) diff --git a/tests/unit/providers/test_bedrock.py b/tests/unit/providers/test_bedrock.py new file mode 100644 index 000000000..1ff07bbbe --- /dev/null +++ b/tests/unit/providers/test_bedrock.py @@ -0,0 +1,53 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree.
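+
+# Unit tests for the region-prefix helpers added to the Bedrock adapter above:
+# prefix detection per AWS region, ARN passthrough, and the us-east-1 default,
+# e.g. _to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0", "eu-west-1")
+# yields "eu.meta.llama3-1-70b-instruct-v1:0".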
+ +from llama_stack.providers.remote.inference.bedrock.bedrock import ( + _get_region_prefix, + _to_inference_profile_id, +) + + +def test_region_prefixes(): + assert _get_region_prefix("us-east-1") == "us." + assert _get_region_prefix("eu-west-1") == "eu." + assert _get_region_prefix("ap-south-1") == "ap." + assert _get_region_prefix("ca-central-1") == "us." + + # Test case insensitive + assert _get_region_prefix("US-EAST-1") == "us." + assert _get_region_prefix("EU-WEST-1") == "eu." + assert _get_region_prefix("Ap-South-1") == "ap." + + # Test None region + assert _get_region_prefix(None) == "us." + + +def test_model_id_conversion(): + # Basic conversion + assert ( + _to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0", "us-east-1") == "us.meta.llama3-1-70b-instruct-v1:0" + ) + + # Already has prefix + assert ( + _to_inference_profile_id("us.meta.llama3-1-70b-instruct-v1:0", "us-east-1") + == "us.meta.llama3-1-70b-instruct-v1:0" + ) + + # ARN should be returned unchanged + arn = "arn:aws:bedrock:us-east-1:123456789012:inference-profile/us.meta.llama3-1-70b-instruct-v1:0" + assert _to_inference_profile_id(arn, "us-east-1") == arn + + # ARN should be returned unchanged even without region + assert _to_inference_profile_id(arn) == arn + + # Optional region parameter defaults to us-east-1 + assert _to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0") == "us.meta.llama3-1-70b-instruct-v1:0" + + # Different regions work with optional parameter + assert ( + _to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0", "eu-west-1") == "eu.meta.llama3-1-70b-instruct-v1:0" + ) From c2d281e01b360ba0a2db177b90df6e7ba4df8501 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Thu, 11 Sep 2025 07:48:19 -0400 Subject: [PATCH 16/30] chore(replay): improve replay robustness with un-validated construction (#3414) # What does this PR do? some providers do not produce spec compliant outputs. when this happens the replay infra will fail to construct the proper types and will return a dict to the client. the client likely does not expect a dict. this was discovered with tgi, which returns finish_reason="" when valid values are "stop", "length" or "content_filter" ## Test Plan ci --- llama_stack/testing/inference_recorder.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llama_stack/testing/inference_recorder.py b/llama_stack/testing/inference_recorder.py index 298758c92..e78f493a6 100644 --- a/llama_stack/testing/inference_recorder.py +++ b/llama_stack/testing/inference_recorder.py @@ -105,8 +105,12 @@ def _deserialize_response(data: dict[str, Any]) -> Any: return cls.model_validate(data["__data__"]) except (ImportError, AttributeError, TypeError, ValueError) as e: - logger.warning(f"Failed to deserialize object of type {data['__type__']}: {e}") - return data["__data__"] + logger.warning(f"Failed to deserialize object of type {data['__type__']} with model_validate: {e}") + try: + return cls.model_construct(**data["__data__"]) + except Exception as e: + logger.warning(f"Failed to deserialize object of type {data['__type__']} with model_construct: {e}") + return data["__data__"] return data From f31bcc11bc9e4a88ce82dadafea8d4b0cb5f7230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 11 Sep 2025 13:48:38 +0200 Subject: [PATCH 17/30] feat: add Azure OpenAI inference provider support (#3396) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? 
Llama Stack now supports a new OpenAI-compatible endpoint with Azure OpenAI. The starter distro has been updated to add the new remote inference provider. A few tests have been modified and improved. ## Test Plan Deploy a model in the Azure portal, then: ``` $ AZURE_API_KEY=... AZURE_API_BASE=... uv run llama stack build --image-type venv --providers inference=remote::azure --run ... $ LLAMA_STACK_CONFIG=http://localhost:8321 uv run --group test pytest -v -ra --text-model azure/gpt-4.1 tests/integration/inference/test_openai_completion.py ... Results: ``` ============================================= test session starts ============================================== platform darwin -- Python 3.12.8, pytest-8.4.1, pluggy-1.6.0 -- /Users/leseb/Documents/AI/llama-stack/.venv/bin/python3 cachedir: .pytest_cache metadata: {'Python': '3.12.8', 'Platform': 'macOS-15.6.1-arm64-arm-64bit', 'Packages': {'pytest': '8.4.1', 'pluggy': '1.6.0'}, 'Plugins': {'anyio': '4.9.0', 'html': '4.1.1', 'socket': '0.7.0', 'asyncio': '1.1.0', 'json-report': '1.5.0', 'timeout': '2.4.0', 'metadata': '3.1.1', 'cov': '6.2.1', 'nbval': '0.11.0', 'hydra-core': '1.3.2'}} rootdir: /Users/leseb/Documents/AI/llama-stack configfile: pyproject.toml plugins: anyio-4.9.0, html-4.1.1, socket-0.7.0, asyncio-1.1.0, json-report-1.5.0, timeout-2.4.0, metadata-3.1.1, cov-6.2.1, nbval-0.11.0, hydra-core-1.3.2 asyncio: mode=Mode.AUTO, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function collected 27 items tests/integration/inference/test_openai_completion.py::test_openai_completion_non_streaming[txt=azure/gpt-5-mini-inference:completion:sanity] SKIPPED [ 3%] tests/integration/inference/test_openai_completion.py::test_openai_completion_non_streaming_suffix[txt=azure/gpt-5-mini-inference:completion:suffix] SKIPPED [ 7%] tests/integration/inference/test_openai_completion.py::test_openai_completion_streaming[txt=azure/gpt-5-mini-inference:completion:sanity] SKIPPED [ 11%] tests/integration/inference/test_openai_completion.py::test_openai_completion_prompt_logprobs[txt=azure/gpt-5-mini-1] SKIPPED [ 14%] tests/integration/inference/test_openai_completion.py::test_openai_completion_guided_choice[txt=azure/gpt-5-mini] SKIPPED [ 18%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_01] PASSED [ 22%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 25%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 29%] tests/integration/inference/test_openai_completion.py::test_inference_store[openai_client-txt=azure/gpt-5-mini-True] PASSED [ 33%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=azure/gpt-5-mini-True] PASSED [ 37%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming_with_file[txt=azure/gpt-5-mini] SKIPPED
[ 40%] tests/integration/inference/test_openai_completion.py::test_openai_completion_prompt_logprobs[txt=azure/gpt-5-mini-0] SKIPPED [ 44%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_02] PASSED [ 48%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 51%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 55%] tests/integration/inference/test_openai_completion.py::test_inference_store[openai_client-txt=azure/gpt-5-mini-False] PASSED [ 59%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=azure/gpt-5-mini-False] PASSED [ 62%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_01] PASSED [ 66%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 70%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 74%] tests/integration/inference/test_openai_completion.py::test_inference_store[client_with_models-txt=azure/gpt-5-mini-True] PASSED [ 77%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=azure/gpt-5-mini-True] PASSED [ 81%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_02] PASSED [ 85%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 88%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 92%] tests/integration/inference/test_openai_completion.py::test_inference_store[client_with_models-txt=azure/gpt-5-mini-False] PASSED [ 96%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=azure/gpt-5-mini-False] PASSED [100%] =========================================== short test summary info ============================================ SKIPPED [3] tests/integration/inference/test_openai_completion.py:63: Model azure/gpt-5-mini hosted by remote::azure doesn't support OpenAI completions. SKIPPED [3] tests/integration/inference/test_openai_completion.py:118: Model azure/gpt-5-mini hosted by remote::azure doesn't support vllm extra_body parameters. SKIPPED [1] tests/integration/inference/test_openai_completion.py:124: Model azure/gpt-5-mini hosted by remote::azure doesn't support chat completion calls with base64 encoded files. 
================================== 20 passed, 7 skipped, 2 warnings in 51.77s ================================== ``` Signed-off-by: Sébastien Han --- docs/source/providers/inference/index.md | 1 + .../providers/inference/remote_azure.md | 29 + llama_stack/distributions/ci-tests/build.yaml | 1 + llama_stack/distributions/ci-tests/run.yaml | 7 + .../distributions/starter-gpu/build.yaml | 1 + .../distributions/starter-gpu/run.yaml | 7 + llama_stack/distributions/starter/build.yaml | 1 + llama_stack/distributions/starter/run.yaml | 7 + llama_stack/distributions/starter/starter.py | 18 + llama_stack/providers/registry/inference.py | 15 + .../remote/inference/azure/__init__.py | 15 + .../providers/remote/inference/azure/azure.py | 64 + .../remote/inference/azure/config.py | 63 + .../remote/inference/azure/models.py | 28 + .../inference/test_openai_completion.py | 53 +- .../inference/test_text_inference.py | 3 +- .../recordings/responses/0fda25b9241c.json | 71 + .../recordings/responses/2b2ad549510d.json | 448 ++++ .../recordings/responses/57b67d1b1a36.json | 71 + .../recordings/responses/8752115f8d0c.json | 71 + .../recordings/responses/94d11daee205.json | 1178 +++++++++ .../recordings/responses/9f3d749cc1c8.json | 1150 +++++++++ .../recordings/responses/c791119e6359.json | 98 + .../recordings/responses/d3e27b7234e2.json | 2150 +++++++++++++++++ .../recordings/responses/fb785db7fafd.json | 310 +++ .../recordings/responses/ff3271401fb4.json | 556 +++++ 26 files changed, 6403 insertions(+), 13 deletions(-) create mode 100644 docs/source/providers/inference/remote_azure.md create mode 100644 llama_stack/providers/remote/inference/azure/__init__.py create mode 100644 llama_stack/providers/remote/inference/azure/azure.py create mode 100644 llama_stack/providers/remote/inference/azure/config.py create mode 100644 llama_stack/providers/remote/inference/azure/models.py create mode 100644 tests/integration/recordings/responses/0fda25b9241c.json create mode 100644 tests/integration/recordings/responses/2b2ad549510d.json create mode 100644 tests/integration/recordings/responses/57b67d1b1a36.json create mode 100644 tests/integration/recordings/responses/8752115f8d0c.json create mode 100644 tests/integration/recordings/responses/94d11daee205.json create mode 100644 tests/integration/recordings/responses/9f3d749cc1c8.json create mode 100644 tests/integration/recordings/responses/c791119e6359.json create mode 100644 tests/integration/recordings/responses/d3e27b7234e2.json create mode 100644 tests/integration/recordings/responses/fb785db7fafd.json create mode 100644 tests/integration/recordings/responses/ff3271401fb4.json diff --git a/docs/source/providers/inference/index.md b/docs/source/providers/inference/index.md index b6d215474..c5720daef 100644 --- a/docs/source/providers/inference/index.md +++ b/docs/source/providers/inference/index.md @@ -18,6 +18,7 @@ This section contains documentation for all available providers for the **infere inline_meta-reference inline_sentence-transformers remote_anthropic +remote_azure remote_bedrock remote_cerebras remote_databricks diff --git a/docs/source/providers/inference/remote_azure.md b/docs/source/providers/inference/remote_azure.md new file mode 100644 index 000000000..19f8f418b --- /dev/null +++ b/docs/source/providers/inference/remote_azure.md @@ -0,0 +1,29 @@ +# remote::azure + +## Description + + +Azure OpenAI inference provider for accessing GPT models and other Azure services. 
+Provider documentation +https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview + + +## Configuration + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `api_key` | `` | No | | Azure API key for Azure | +| `api_base` | `` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) | +| `api_version` | `str \| None` | No | | Azure API version for Azure (e.g., 2024-12-01-preview) | +| `api_type` | `str \| None` | No | azure | Azure API type for Azure (e.g., azure) | + +## Sample Configuration + +```yaml +api_key: ${env.AZURE_API_KEY:=} +api_base: ${env.AZURE_API_BASE:=} +api_version: ${env.AZURE_API_VERSION:=} +api_type: ${env.AZURE_API_TYPE:=} + +``` + diff --git a/llama_stack/distributions/ci-tests/build.yaml b/llama_stack/distributions/ci-tests/build.yaml index 8e6c0bf67..a4d920cd6 100644 --- a/llama_stack/distributions/ci-tests/build.yaml +++ b/llama_stack/distributions/ci-tests/build.yaml @@ -17,6 +17,7 @@ distribution_spec: - provider_type: remote::vertexai - provider_type: remote::groq - provider_type: remote::sambanova + - provider_type: remote::azure - provider_type: inline::sentence-transformers vector_io: - provider_type: inline::faiss diff --git a/llama_stack/distributions/ci-tests/run.yaml b/llama_stack/distributions/ci-tests/run.yaml index 26a677c7a..a478a3872 100644 --- a/llama_stack/distributions/ci-tests/run.yaml +++ b/llama_stack/distributions/ci-tests/run.yaml @@ -81,6 +81,13 @@ providers: config: url: https://api.sambanova.ai/v1 api_key: ${env.SAMBANOVA_API_KEY:=} + - provider_id: ${env.AZURE_API_KEY:+azure} + provider_type: remote::azure + config: + api_key: ${env.AZURE_API_KEY:=} + api_base: ${env.AZURE_API_BASE:=} + api_version: ${env.AZURE_API_VERSION:=} + api_type: ${env.AZURE_API_TYPE:=} - provider_id: sentence-transformers provider_type: inline::sentence-transformers vector_io: diff --git a/llama_stack/distributions/starter-gpu/build.yaml b/llama_stack/distributions/starter-gpu/build.yaml index ff7c58e6f..05a2bf180 100644 --- a/llama_stack/distributions/starter-gpu/build.yaml +++ b/llama_stack/distributions/starter-gpu/build.yaml @@ -18,6 +18,7 @@ distribution_spec: - provider_type: remote::vertexai - provider_type: remote::groq - provider_type: remote::sambanova + - provider_type: remote::azure - provider_type: inline::sentence-transformers vector_io: - provider_type: inline::faiss diff --git a/llama_stack/distributions/starter-gpu/run.yaml b/llama_stack/distributions/starter-gpu/run.yaml index 5d9dfcb27..786506706 100644 --- a/llama_stack/distributions/starter-gpu/run.yaml +++ b/llama_stack/distributions/starter-gpu/run.yaml @@ -81,6 +81,13 @@ providers: config: url: https://api.sambanova.ai/v1 api_key: ${env.SAMBANOVA_API_KEY:=} + - provider_id: ${env.AZURE_API_KEY:+azure} + provider_type: remote::azure + config: + api_key: ${env.AZURE_API_KEY:=} + api_base: ${env.AZURE_API_BASE:=} + api_version: ${env.AZURE_API_VERSION:=} + api_type: ${env.AZURE_API_TYPE:=} - provider_id: sentence-transformers provider_type: inline::sentence-transformers vector_io: diff --git a/llama_stack/distributions/starter/build.yaml b/llama_stack/distributions/starter/build.yaml index e84e528da..2f0cd24fd 100644 --- a/llama_stack/distributions/starter/build.yaml +++ b/llama_stack/distributions/starter/build.yaml @@ -18,6 +18,7 @@ distribution_spec: - provider_type: remote::vertexai - provider_type: remote::groq - provider_type: remote::sambanova + - provider_type: remote::azure - 
provider_type: inline::sentence-transformers vector_io: - provider_type: inline::faiss diff --git a/llama_stack/distributions/starter/run.yaml b/llama_stack/distributions/starter/run.yaml index a3962b8aa..2814b2ced 100644 --- a/llama_stack/distributions/starter/run.yaml +++ b/llama_stack/distributions/starter/run.yaml @@ -81,6 +81,13 @@ providers: config: url: https://api.sambanova.ai/v1 api_key: ${env.SAMBANOVA_API_KEY:=} + - provider_id: ${env.AZURE_API_KEY:+azure} + provider_type: remote::azure + config: + api_key: ${env.AZURE_API_KEY:=} + api_base: ${env.AZURE_API_BASE:=} + api_version: ${env.AZURE_API_VERSION:=} + api_type: ${env.AZURE_API_TYPE:=} - provider_id: sentence-transformers provider_type: inline::sentence-transformers vector_io: diff --git a/llama_stack/distributions/starter/starter.py b/llama_stack/distributions/starter/starter.py index 2fca52700..c2dfe95ad 100644 --- a/llama_stack/distributions/starter/starter.py +++ b/llama_stack/distributions/starter/starter.py @@ -59,6 +59,7 @@ ENABLED_INFERENCE_PROVIDERS = [ "cerebras", "nvidia", "bedrock", + "azure", ] INFERENCE_PROVIDER_IDS = { @@ -68,6 +69,7 @@ INFERENCE_PROVIDER_IDS = { "cerebras": "${env.CEREBRAS_API_KEY:+cerebras}", "nvidia": "${env.NVIDIA_API_KEY:+nvidia}", "vertexai": "${env.VERTEX_AI_PROJECT:+vertexai}", + "azure": "${env.AZURE_API_KEY:+azure}", } @@ -277,5 +279,21 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate: "http://localhost:11434", "Ollama URL", ), + "AZURE_API_KEY": ( + "", + "Azure API Key", + ), + "AZURE_API_BASE": ( + "", + "Azure API Base", + ), + "AZURE_API_VERSION": ( + "", + "Azure API Version", + ), + "AZURE_API_TYPE": ( + "azure", + "Azure API Type", + ), }, ) diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index 8912560cb..64196152b 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -295,4 +295,19 @@ Available Models: description="IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform.", ), ), + remote_provider_spec( + api=Api.inference, + adapter=AdapterSpec( + adapter_type="azure", + pip_packages=["litellm"], + module="llama_stack.providers.remote.inference.azure", + config_class="llama_stack.providers.remote.inference.azure.AzureConfig", + provider_data_validator="llama_stack.providers.remote.inference.azure.config.AzureProviderDataValidator", + description=""" +Azure OpenAI inference provider for accessing GPT models and other Azure services. +Provider documentation +https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview +""", + ), + ), ] diff --git a/llama_stack/providers/remote/inference/azure/__init__.py b/llama_stack/providers/remote/inference/azure/__init__.py new file mode 100644 index 000000000..87bcaf309 --- /dev/null +++ b/llama_stack/providers/remote/inference/azure/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
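+
+# Provider entry point: the registry calls get_adapter_impl() to construct the
+# adapter and run its async initialize() before serving requests.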
+
+from .config import AzureConfig
+
+
+async def get_adapter_impl(config: AzureConfig, _deps):
+    from .azure import AzureInferenceAdapter
+
+    impl = AzureInferenceAdapter(config)
+    await impl.initialize()
+    return impl
diff --git a/llama_stack/providers/remote/inference/azure/azure.py b/llama_stack/providers/remote/inference/azure/azure.py
new file mode 100644
index 000000000..449bbbb1c
--- /dev/null
+++ b/llama_stack/providers/remote/inference/azure/azure.py
@@ -0,0 +1,64 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+from urllib.parse import urljoin
+
+from llama_stack.apis.inference import ChatCompletionRequest
+from llama_stack.providers.utils.inference.litellm_openai_mixin import (
+    LiteLLMOpenAIMixin,
+)
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
+
+from .config import AzureConfig
+from .models import MODEL_ENTRIES
+
+
+class AzureInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
+    def __init__(self, config: AzureConfig) -> None:
+        LiteLLMOpenAIMixin.__init__(
+            self,
+            MODEL_ENTRIES,
+            litellm_provider_name="azure",
+            api_key_from_config=config.api_key.get_secret_value(),
+            provider_data_api_key_field="azure_api_key",
+            openai_compat_api_base=str(config.api_base),
+        )
+        self.config = config
+
+    # Delegate get_api_key to LiteLLMOpenAIMixin: it resolves the key from
+    # per-request provider data (azure_api_key), falling back to the config value
+    get_api_key = LiteLLMOpenAIMixin.get_api_key
+
+    def get_base_url(self) -> str:
+        """
+        Get the Azure API base URL.
+
+        Returns the Azure API base URL from the configuration.
+        """
+        # urljoin with an absolute path replaces any path on api_base, yielding
+        # e.g. https://your-resource-name.openai.azure.com/openai/v1
+        return urljoin(str(self.config.api_base), "/openai/v1")
+
+    async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
+        # Get base parameters from parent
+        params = await super()._get_params(request)
+
+        # Add Azure specific parameters, taking per-request provider data when
+        # present and otherwise falling back to the static config
+        provider_data = self.get_request_provider_data()
+        if provider_data:
+            if getattr(provider_data, "azure_api_key", None):
+                params["api_key"] = provider_data.azure_api_key
+            if getattr(provider_data, "azure_api_base", None):
+                params["api_base"] = provider_data.azure_api_base
+            if getattr(provider_data, "azure_api_version", None):
+                params["api_version"] = provider_data.azure_api_version
+            if getattr(provider_data, "azure_api_type", None):
+                params["api_type"] = provider_data.azure_api_type
+        else:
+            params["api_key"] = self.config.api_key.get_secret_value()
+            params["api_base"] = str(self.config.api_base)
+            params["api_version"] = self.config.api_version
+            params["api_type"] = self.config.api_type
+
+        return params
diff --git a/llama_stack/providers/remote/inference/azure/config.py b/llama_stack/providers/remote/inference/azure/config.py
new file mode 100644
index 000000000..fe9d61d53
--- /dev/null
+++ b/llama_stack/providers/remote/inference/azure/config.py
@@ -0,0 +1,63 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
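+#
+# Two configuration surfaces are defined below, mirroring how azure.py consumes
+# them: AzureConfig carries the static settings from run.yaml/environment, while
+# AzureProviderDataValidator validates per-request overrides supplied through
+# provider data headers (azure_api_key, azure_api_base, ...).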
+ +import os +from typing import Any + +from pydantic import BaseModel, Field, HttpUrl, SecretStr + +from llama_stack.schema_utils import json_schema_type + + +class AzureProviderDataValidator(BaseModel): + azure_api_key: SecretStr = Field( + description="Azure API key for Azure", + ) + azure_api_base: HttpUrl = Field( + description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com)", + ) + azure_api_version: str | None = Field( + default=None, + description="Azure API version for Azure (e.g., 2024-06-01)", + ) + azure_api_type: str | None = Field( + default="azure", + description="Azure API type for Azure (e.g., azure)", + ) + + +@json_schema_type +class AzureConfig(BaseModel): + api_key: SecretStr = Field( + description="Azure API key for Azure", + ) + api_base: HttpUrl = Field( + description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com)", + ) + api_version: str | None = Field( + default_factory=lambda: os.getenv("AZURE_API_VERSION"), + description="Azure API version for Azure (e.g., 2024-12-01-preview)", + ) + api_type: str | None = Field( + default_factory=lambda: os.getenv("AZURE_API_TYPE", "azure"), + description="Azure API type for Azure (e.g., azure)", + ) + + @classmethod + def sample_run_config( + cls, + api_key: str = "${env.AZURE_API_KEY:=}", + api_base: str = "${env.AZURE_API_BASE:=}", + api_version: str = "${env.AZURE_API_VERSION:=}", + api_type: str = "${env.AZURE_API_TYPE:=}", + **kwargs, + ) -> dict[str, Any]: + return { + "api_key": api_key, + "api_base": api_base, + "api_version": api_version, + "api_type": api_type, + } diff --git a/llama_stack/providers/remote/inference/azure/models.py b/llama_stack/providers/remote/inference/azure/models.py new file mode 100644 index 000000000..64c87969b --- /dev/null +++ b/llama_stack/providers/remote/inference/azure/models.py @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.providers.utils.inference.model_registry import ( + ProviderModelEntry, +) + +# https://learn.microsoft.com/en-us/azure/ai-foundry/openai/concepts/models?tabs=global-standard%2Cstandard-chat-completions +LLM_MODEL_IDS = [ + "gpt-5", + "gpt-5-mini", + "gpt-5-nano", + "gpt-5-chat", + "o1", + "o1-mini", + "o3-mini", + "o4-mini", + "gpt-4.1", + "gpt-4.1-mini", + "gpt-4.1-nano", +] + +SAFETY_MODELS_ENTRIES = list[ProviderModelEntry]() + +MODEL_ENTRIES = [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS] + SAFETY_MODELS_ENTRIES diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index f9c837ebd..22dec8876 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -6,12 +6,25 @@ import time +import unicodedata import pytest from ..test_cases.test_case import TestCase +def _normalize_text(text: str) -> str: + """ + Normalize Unicode text by removing diacritical marks for comparison. + + The test case streaming_01 expects the answer "Sol" for the question "What's the name of the Sun + in latin?", but the model is returning "sōl" (with a macron over the 'o'), which is the correct + Latin spelling. The test is failing because it's doing a simple case-insensitive string search + for "sol" but the actual response contains the diacritical mark. 
+ """ + return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("ascii").lower() + + def provider_from_model(client_with_models, model_id): models = {m.identifier: m for m in client_with_models.models.list()} models.update({m.provider_resource_id: m for m in client_with_models.models.list()}) @@ -42,6 +55,10 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id) "remote::groq", "remote::gemini", # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404 "remote::anthropic", # at least claude-3-{5,7}-{haiku,sonnet}-* / claude-{sonnet,opus}-4-* are not supported + "remote::azure", # {'error': {'code': 'OperationNotSupported', 'message': 'The completion operation + # does not work with the specified model, gpt-5-mini. Please choose different model and try + # again. You can learn more about which models can be used with each operation here: + # https://go.microsoft.com/fwlink/?linkid=2197993.'}}"} ): pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.") @@ -157,7 +174,8 @@ def test_openai_completion_non_streaming_suffix(llama_stack_client, client_with_ assert len(response.choices) > 0 choice = response.choices[0] assert len(choice.text) > 5 - assert "france" in choice.text.lower() + normalized_text = _normalize_text(choice.text) + assert "france" in normalized_text @pytest.mark.parametrize( @@ -248,7 +266,9 @@ def test_openai_chat_completion_non_streaming(compat_client, client_with_models, ) message_content = response.choices[0].message.content.lower().strip() assert len(message_content) > 0 - assert expected.lower() in message_content + normalized_expected = _normalize_text(expected) + normalized_content = _normalize_text(message_content) + assert normalized_expected in normalized_content @pytest.mark.parametrize( @@ -272,10 +292,13 @@ def test_openai_chat_completion_streaming(compat_client, client_with_models, tex ) streamed_content = [] for chunk in response: - if chunk.choices[0].delta.content: + # On some providers like Azure, the choices are empty on the first chunk, so we need to check for that + if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content: streamed_content.append(chunk.choices[0].delta.content.lower().strip()) assert len(streamed_content) > 0 - assert expected.lower() in "".join(streamed_content) + normalized_expected = _normalize_text(expected) + normalized_content = _normalize_text("".join(streamed_content)) + assert normalized_expected in normalized_content @pytest.mark.parametrize( @@ -308,8 +331,12 @@ def test_openai_chat_completion_streaming_with_n(compat_client, client_with_mode streamed_content.get(choice.index, "") + choice.delta.content.lower().strip() ) assert len(streamed_content) == 2 + normalized_expected = _normalize_text(expected) for i, content in streamed_content.items(): - assert expected.lower() in content, f"Choice {i}: Expected {expected.lower()} in {content}" + normalized_content = _normalize_text(content) + assert normalized_expected in normalized_content, ( + f"Choice {i}: Expected {normalized_expected} in {normalized_content}" + ) @pytest.mark.parametrize( @@ -339,9 +366,9 @@ def test_inference_store(compat_client, client_with_models, text_model_id, strea content = "" response_id = None for chunk in response: - if response_id is None: + if response_id is None and chunk.id: response_id = chunk.id - if chunk.choices[0].delta.content: + if chunk.choices and len(chunk.choices) > 0 and 
chunk.choices[0].delta.content: content += chunk.choices[0].delta.content else: response_id = response.id @@ -410,11 +437,12 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode content = "" response_id = None for chunk in response: - if response_id is None: + if response_id is None and chunk.id: response_id = chunk.id - if delta := chunk.choices[0].delta: - if delta.content: - content += delta.content + if chunk.choices and len(chunk.choices) > 0: + if delta := chunk.choices[0].delta: + if delta.content: + content += delta.content else: response_id = response.id content = response.choices[0].message.content @@ -484,4 +512,5 @@ def test_openai_chat_completion_non_streaming_with_file(openai_client, client_wi stream=False, ) message_content = response.choices[0].message.content.lower().strip() - assert "hello world" in message_content + normalized_content = _normalize_text(message_content) + assert "hello world" in normalized_content diff --git a/tests/integration/inference/test_text_inference.py b/tests/integration/inference/test_text_inference.py index d7ffe5929..621084231 100644 --- a/tests/integration/inference/test_text_inference.py +++ b/tests/integration/inference/test_text_inference.py @@ -32,6 +32,7 @@ def skip_if_model_doesnt_support_completion(client_with_models, model_id): "remote::vertexai", "remote::groq", "remote::sambanova", + "remote::azure", ) or "openai-compat" in provider.provider_type ): @@ -44,7 +45,7 @@ def skip_if_model_doesnt_support_json_schema_structured_output(client_with_model provider_id = models[model_id].provider_id providers = {p.provider_id: p for p in client_with_models.providers.list()} provider = providers[provider_id] - if provider.provider_type in ("remote::sambanova",): + if provider.provider_type in ("remote::sambanova", "remote::azure"): pytest.skip( f"Model {model_id} hosted by {provider.provider_type} doesn't support json_schema structured output" ) diff --git a/tests/integration/recordings/responses/0fda25b9241c.json b/tests/integration/recordings/responses/0fda25b9241c.json new file mode 100644 index 000000000..b97ee1670 --- /dev/null +++ b/tests/integration/recordings/responses/0fda25b9241c.json @@ -0,0 +1,71 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "Which planet do humans live on?" + } + ], + "stream": false + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": { + "__type__": "openai.types.chat.chat_completion.ChatCompletion", + "__data__": { + "id": "chatcmpl-CECIXqfvjuluKkZtG3q2QJoSQhBU0", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Humans live on Earth \u2014 the third planet from the Sun. 
It's the only known planet that naturally supports life, with a breathable atmosphere, liquid water, and temperatures suitable for living organisms.", + "refusal": null, + "role": "assistant", + "annotations": [], + "audio": null, + "function_call": null, + "tool_calls": null + }, + "content_filter_results": {} + } + ], + "created": 1757499901, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion", + "service_tier": null, + "system_fingerprint": null, + "usage": { + "completion_tokens": 112, + "prompt_tokens": 13, + "total_tokens": 125, + "completion_tokens_details": { + "accepted_prediction_tokens": 0, + "audio_tokens": 0, + "reasoning_tokens": 64, + "rejected_prediction_tokens": 0 + }, + "prompt_tokens_details": { + "audio_tokens": 0, + "cached_tokens": 0 + } + }, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + "is_streaming": false + } +} diff --git a/tests/integration/recordings/responses/2b2ad549510d.json b/tests/integration/recordings/responses/2b2ad549510d.json new file mode 100644 index 000000000..55a9d6426 --- /dev/null +++ b/tests/integration/recordings/responses/2b2ad549510d.json @@ -0,0 +1,448 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "Hello, world!" + } + ], + "stream": true + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": [ + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "", + "choices": [], + "created": 0, + "model": "", + "object": "", + "service_tier": null, + "system_fingerprint": null, + "usage": null, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": "Hello", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": ",", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": 
null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " world", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": "!", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " Hi", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " \u2014", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " how", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " can", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": 
"chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " I", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " help", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " you", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": " today", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": "?", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499910, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + } + ], + "is_streaming": true + } +} diff --git a/tests/integration/recordings/responses/57b67d1b1a36.json b/tests/integration/recordings/responses/57b67d1b1a36.json new file mode 100644 index 000000000..14de1d85e --- /dev/null +++ b/tests/integration/recordings/responses/57b67d1b1a36.json 
@@ -0,0 +1,71 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "Which planet has rings around it with a name starting with letter S?" + } + ], + "stream": false + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": { + "__type__": "openai.types.chat.chat_completion.ChatCompletion", + "__data__": { + "id": "chatcmpl-CECIkT5cbqFazpungtewksVePcUNa", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Saturn. It's the planet famous for its prominent ring system made of ice and rock.", + "refusal": null, + "role": "assistant", + "annotations": [], + "audio": null, + "function_call": null, + "tool_calls": null + }, + "content_filter_results": {} + } + ], + "created": 1757499914, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion", + "service_tier": null, + "system_fingerprint": null, + "usage": { + "completion_tokens": 156, + "prompt_tokens": 20, + "total_tokens": 176, + "completion_tokens_details": { + "accepted_prediction_tokens": 0, + "audio_tokens": 0, + "reasoning_tokens": 128, + "rejected_prediction_tokens": 0 + }, + "prompt_tokens_details": { + "audio_tokens": 0, + "cached_tokens": 0 + } + }, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + "is_streaming": false + } +} diff --git a/tests/integration/recordings/responses/8752115f8d0c.json b/tests/integration/recordings/responses/8752115f8d0c.json new file mode 100644 index 000000000..0e88bbfa6 --- /dev/null +++ b/tests/integration/recordings/responses/8752115f8d0c.json @@ -0,0 +1,71 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "Hello, world!" + } + ], + "stream": false + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": { + "__type__": "openai.types.chat.chat_completion.ChatCompletion", + "__data__": { + "id": "chatcmpl-CECIuyylsMNXspa83k8LrD8SQadNY", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Hello! 
\ud83d\udc4b How can I help you today \u2014 answer a question, write or edit something, debug code, brainstorm ideas, or anything else?", + "refusal": null, + "role": "assistant", + "annotations": [], + "audio": null, + "function_call": null, + "tool_calls": null + }, + "content_filter_results": {} + } + ], + "created": 1757499924, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion", + "service_tier": null, + "system_fingerprint": null, + "usage": { + "completion_tokens": 40, + "prompt_tokens": 10, + "total_tokens": 50, + "completion_tokens_details": { + "accepted_prediction_tokens": 0, + "audio_tokens": 0, + "reasoning_tokens": 0, + "rejected_prediction_tokens": 0 + }, + "prompt_tokens_details": { + "audio_tokens": 0, + "cached_tokens": 0 + } + }, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + "is_streaming": false + } +} diff --git a/tests/integration/recordings/responses/94d11daee205.json b/tests/integration/recordings/responses/94d11daee205.json new file mode 100644 index 000000000..b6a6c3d68 --- /dev/null +++ b/tests/integration/recordings/responses/94d11daee205.json @@ -0,0 +1,1178 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "What is the name of the US captial?" + } + ], + "n": 2, + "stream": true + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": [ + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "", + "choices": [], + "created": 0, + "model": "", + "object": "", + "service_tier": null, + "system_fingerprint": null, + "usage": null, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": "The", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", 
+ "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " capital", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " of", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " United", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " States", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": "The", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": 
{ + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " capital", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " of", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " United", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " is", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " States", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " Washington", + "function_call": null, + "refusal": null, + "role": null, + 
"tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " is", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ",", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " Washington", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " D", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ",", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ".C", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + 
"object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " D", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ".", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ".C", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " (", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ".", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": "the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", 
+ "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " (", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " District", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": "official", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " of", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": "ly", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " Columbia", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": 
null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ").", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " District", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " of", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": " Columbia", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": ").", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": 
"gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIpbpLN9VO3z9pVAidTRslxRHtL", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "stop", + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499919, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + } + ], + "is_streaming": true + } +} diff --git a/tests/integration/recordings/responses/9f3d749cc1c8.json b/tests/integration/recordings/responses/9f3d749cc1c8.json new file mode 100644 index 000000000..9a4539ab0 --- /dev/null +++ b/tests/integration/recordings/responses/9f3d749cc1c8.json @@ -0,0 +1,1150 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "What's the name of the Sun in latin?" + } + ], + "stream": true + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": [ + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "", + "choices": [], + "created": 0, + "model": "", + "object": "", + "service_tier": null, + "system_fingerprint": null, + "usage": null, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "The", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " Latin", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + 
"id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " name", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " is", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " \"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "Sol", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "\"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " (", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "gen", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + 
"finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "itive", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " \"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "S", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "olis", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "\").", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " It's", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": 
"chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " used", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " as", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " proper", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " name", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " of", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": 
"openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " Sun", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": ";", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " poets", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " also", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " sometimes", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " 
used", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " Greek", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "-derived", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " ep", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "ithe", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "ts", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " like", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + 
"content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": " \"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "Pho", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "eb", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": "us", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": ".\"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIZYHVRY3J0EiPODz10HVzL7cIe", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499903, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, 
+ "usage": null + } + } + ], + "is_streaming": true + } +} diff --git a/tests/integration/recordings/responses/c791119e6359.json b/tests/integration/recordings/responses/c791119e6359.json new file mode 100644 index 000000000..6ac123e92 --- /dev/null +++ b/tests/integration/recordings/responses/c791119e6359.json @@ -0,0 +1,98 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "What's the weather in Tokyo? Use the get_weather function to get the weather." + } + ], + "stream": false, + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the weather in a given city", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city to get the weather for" + } + } + } + } + } + ] + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": { + "__type__": "openai.types.chat.chat_completion.ChatCompletion", + "__data__": { + "id": "chatcmpl-CECIwq9Odd0mOJMmw7ytv8iEazH4H", + "choices": [ + { + "finish_reason": "tool_calls", + "index": 0, + "logprobs": null, + "message": { + "content": null, + "refusal": null, + "role": "assistant", + "annotations": [], + "audio": null, + "function_call": null, + "tool_calls": [ + { + "id": "call_yw18spRc1jjUlEyabbXBhB33", + "function": { + "arguments": "{\"city\":\"Tokyo\"}", + "name": "get_weather" + }, + "type": "function" + } + ] + }, + "content_filter_results": {} + } + ], + "created": 1757499926, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion", + "service_tier": null, + "system_fingerprint": null, + "usage": { + "completion_tokens": 88, + "prompt_tokens": 151, + "total_tokens": 239, + "completion_tokens_details": { + "accepted_prediction_tokens": 0, + "audio_tokens": 0, + "reasoning_tokens": 64, + "rejected_prediction_tokens": 0 + }, + "prompt_tokens_details": { + "audio_tokens": 0, + "cached_tokens": 0 + } + }, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + "is_streaming": false + } +} diff --git a/tests/integration/recordings/responses/d3e27b7234e2.json b/tests/integration/recordings/responses/d3e27b7234e2.json new file mode 100644 index 000000000..7f266c392 --- /dev/null +++ b/tests/integration/recordings/responses/d3e27b7234e2.json @@ -0,0 +1,2150 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "What's the name of the Sun in latin?" 
+ } + ], + "n": 2, + "stream": true + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": [ + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "", + "choices": [], + "created": 0, + "model": "", + "object": "", + "service_tier": null, + "system_fingerprint": null, + "usage": null, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "In", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " Latin", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " Sun", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " is", + "function_call": null, + "refusal": null, + "role": null, 
+ "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " called", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " \"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "Sol", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "\"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " (", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "sol", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", 
+ "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": ",", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " gen", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "itive", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " sol", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "is", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": 
"openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "The", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " Latin", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " name", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": ",", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " is", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " masculine", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " 
\"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": ").", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "Sol", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "\"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " (", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " The", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " name", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + 
], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " is", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "s", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " also", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "\u014d", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " used", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "l", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + 
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " for", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "),", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " gen", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " Roman", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "itive", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + 
"content": " sun", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " \"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " god", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "s", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " (", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "\u014d", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "e", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + 
"content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "lis", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": ".g", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "\".", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": ".,", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " ", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " Sol", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + 
"usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " As", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " Inv", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " an", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "ict", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " epit", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "us", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + 
{ + "delta": { + "content": "het", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": ").", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " it", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "\u2019s", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " also", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " called", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " \"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, 
+ "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "Pho", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "eb", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "us", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": "\"", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " in", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": " poetry", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": 
null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": ".", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIdmgM7bbQr6YefuUbY4cycibvm", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "stop", + "index": 1, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499907, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + } + ], + "is_streaming": true + } +} diff --git a/tests/integration/recordings/responses/fb785db7fafd.json b/tests/integration/recordings/responses/fb785db7fafd.json new file mode 100644 index 000000000..086d211e8 --- /dev/null +++ b/tests/integration/recordings/responses/fb785db7fafd.json @@ -0,0 +1,310 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "What's the weather in Tokyo? Use the get_weather function to get the weather." 
+ } + ], + "stream": true, + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the weather in a given city", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city to get the weather for" + } + } + } + } + } + ] + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": [ + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "", + "choices": [], + "created": 0, + "model": "", + "object": "", + "service_tier": null, + "system_fingerprint": null, + "usage": null, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": [ + { + "index": 0, + "id": "call_TMbEoYn9q0ZKtoxav5LpD9Ts", + "function": { + "arguments": "", + "name": "get_weather" + }, + "type": "function" + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499912, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": [ + { + "index": 0, + "id": null, + "function": { + "arguments": "{\"", + "name": null + }, + "type": null + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499912, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": [ + { + "index": 0, + "id": null, + "function": { + "arguments": "city", + "name": null + }, + "type": null + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499912, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": [ + { + "index": 0, + "id": null, + "function": { + "arguments": "\":\"", + "name": null + }, + "type": null + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499912, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": 
"openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": [ + { + "index": 0, + "id": null, + "function": { + "arguments": "Tokyo", + "name": null + }, + "type": null + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499912, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": [ + { + "index": 0, + "id": null, + "function": { + "arguments": "\"}", + "name": null + }, + "type": null + } + ] + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499912, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "tool_calls", + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499912, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + } + ], + "is_streaming": true + } +} diff --git a/tests/integration/recordings/responses/ff3271401fb4.json b/tests/integration/recordings/responses/ff3271401fb4.json new file mode 100644 index 000000000..bf7ec89f7 --- /dev/null +++ b/tests/integration/recordings/responses/ff3271401fb4.json @@ -0,0 +1,556 @@ +{ + "request": { + "method": "POST", + "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "gpt-5-mini", + "messages": [ + { + "role": "user", + "content": "What is the name of the US captial?" 
+ } + ], + "stream": true + }, + "endpoint": "/v1/chat/completions", + "model": "gpt-5-mini" + }, + "response": { + "body": [ + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "", + "choices": [], + "created": 0, + "model": "", + "object": "", + "service_tier": null, + "system_fingerprint": null, + "usage": null, + "prompt_filter_results": [ + { + "prompt_index": 0, + "content_filter_results": {} + } + ] + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": "", + "function_call": null, + "refusal": null, + "role": "assistant", + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": "The", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " capital", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " of", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " the", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " United", + "function_call": null, + "refusal": null, + "role": null, + 
"tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " States", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " is", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " Washington", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": ",", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " D", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": ".C", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": 
"gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": ".", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " (", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": "District", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " of", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": " Columbia", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": ").", + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": null, + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + }, + { + "__type__": 
"openai.types.chat.chat_completion_chunk.ChatCompletionChunk", + "__data__": { + "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs", + "choices": [ + { + "delta": { + "content": null, + "function_call": null, + "refusal": null, + "role": null, + "tool_calls": null + }, + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "content_filter_results": {} + } + ], + "created": 1757499916, + "model": "gpt-5-mini-2025-08-07", + "object": "chat.completion.chunk", + "service_tier": null, + "system_fingerprint": null, + "usage": null + } + } + ], + "is_streaming": true + } +} From d15368a3026450d1474f4a4db47b89fd3e6057ca Mon Sep 17 00:00:00 2001 From: Francisco Arceo Date: Thu, 11 Sep 2025 06:20:11 -0600 Subject: [PATCH 18/30] chore: Updating documentation, adding exception handling for Vector Stores in RAG Tool, more tests on migration, and migrate off of inference_api for context_retriever for RAG (#3367) # What does this PR do? - Updating documentation on migration from RAG Tool to Vector Stores and Files APIs - Adding exception handling for Vector Stores in RAG Tool - Add more tests on migration from RAG Tool to Vector Stores - Migrate off of inference_api for context_retriever for RAG ## Test Plan Integration and unit tests added Signed-off-by: Francisco Javier Arceo --- docs/source/building_applications/rag.md | 21 ++ .../tool_runtime/rag/context_retriever.py | 12 +- .../inline/tool_runtime/rag/memory.py | 121 ++++++---- .../integration/tool_runtime/test_rag_tool.py | 208 ++++++++++++++++++ .../utils/memory/test_vector_store.py | 38 ++++ 5 files changed, 355 insertions(+), 45 deletions(-) diff --git a/docs/source/building_applications/rag.md b/docs/source/building_applications/rag.md index 289c38991..802859e87 100644 --- a/docs/source/building_applications/rag.md +++ b/docs/source/building_applications/rag.md @@ -93,10 +93,31 @@ chunks_response = client.vector_io.query( ### Using the RAG Tool +> **⚠️ DEPRECATION NOTICE**: The RAG Tool is being deprecated in favor of directly using the OpenAI-compatible Search +> API. We recommend migrating to the OpenAI APIs for better compatibility and future support. + A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc. and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the [appendix](#more-ragdocument-examples). +#### OpenAI API Integration & Migration + +The RAG tool has been updated to use OpenAI-compatible APIs. This provides several benefits: + +- **Files API Integration**: Documents are now uploaded using OpenAI's file upload endpoints +- **Vector Stores API**: Vector storage operations use OpenAI's vector store format with configurable chunking strategies +- **Error Resilience:** When processing multiple documents, individual failures are logged but don't crash the operation. Failed documents are skipped while successful ones continue processing. + +**Migration Path:** +We recommend migrating to the OpenAI-compatible Search API for: +1. **Better OpenAI Ecosystem Integration**: Direct compatibility with OpenAI tools and workflows including the Responses API +2**Future-Proof**: Continued support and feature development +3**Full OpenAI Compatibility**: Vector Stores, Files, and Search APIs are fully compatible with OpenAI's Responses API + +The OpenAI APIs are used under the hood, so you can continue to use your existing RAG Tool code with minimal changes. 
+However, we recommend updating your code to use the new OpenAI-compatible APIs for better long-term support. If any +documents fail to process, they will be logged in the response but will not cause the entire operation to fail. + ```python from llama_stack_client import RAGDocument diff --git a/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py b/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py index be18430e4..9bc22f979 100644 --- a/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +++ b/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py @@ -8,7 +8,7 @@ from jinja2 import Template from llama_stack.apis.common.content_types import InterleavedContent -from llama_stack.apis.inference import UserMessage +from llama_stack.apis.inference import OpenAIUserMessageParam from llama_stack.apis.tools.rag_tool import ( DefaultRAGQueryGeneratorConfig, LLMRAGQueryGeneratorConfig, @@ -61,16 +61,16 @@ async def llm_rag_query_generator( messages = [interleaved_content_as_str(content)] template = Template(config.template) - content = template.render({"messages": messages}) + rendered_content: str = template.render({"messages": messages}) model = config.model - message = UserMessage(content=content) - response = await inference_api.chat_completion( - model_id=model, + message = OpenAIUserMessageParam(content=rendered_content) + response = await inference_api.openai_chat_completion( + model=model, messages=[message], stream=False, ) - query = response.completion_message.content + query = response.choices[0].message.content return query diff --git a/llama_stack/providers/inline/tool_runtime/rag/memory.py b/llama_stack/providers/inline/tool_runtime/rag/memory.py index aa629cca8..bc68f198d 100644 --- a/llama_stack/providers/inline/tool_runtime/rag/memory.py +++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py @@ -45,10 +45,7 @@ from llama_stack.apis.vector_io import ( from llama_stack.log import get_logger from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str -from llama_stack.providers.utils.memory.vector_store import ( - content_from_doc, - parse_data_url, -) +from llama_stack.providers.utils.memory.vector_store import parse_data_url from .config import RagToolRuntimeConfig from .context_retriever import generate_rag_query @@ -60,6 +57,47 @@ def make_random_string(length: int = 8): return "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(length)) +async def raw_data_from_doc(doc: RAGDocument) -> tuple[bytes, str]: + """Get raw binary data and mime type from a RAGDocument for file upload.""" + if isinstance(doc.content, URL): + if doc.content.uri.startswith("data:"): + parts = parse_data_url(doc.content.uri) + mime_type = parts["mimetype"] + data = parts["data"] + + if parts["is_base64"]: + file_data = base64.b64decode(data) + else: + file_data = data.encode("utf-8") + + return file_data, mime_type + else: + async with httpx.AsyncClient() as client: + r = await client.get(doc.content.uri) + r.raise_for_status() + mime_type = r.headers.get("content-type", "application/octet-stream") + return r.content, mime_type + else: + if isinstance(doc.content, str): + content_str = doc.content + else: + content_str = interleaved_content_as_str(doc.content) + + if content_str.startswith("data:"): + parts = parse_data_url(content_str) + mime_type = parts["mimetype"] + data = parts["data"] + + if parts["is_base64"]: + 
file_data = base64.b64decode(data) + else: + file_data = data.encode("utf-8") + + return file_data, mime_type + else: + return content_str.encode("utf-8"), "text/plain" + + class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime): def __init__( self, @@ -95,46 +133,52 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti return for doc in documents: - if isinstance(doc.content, URL): - if doc.content.uri.startswith("data:"): - parts = parse_data_url(doc.content.uri) - file_data = base64.b64decode(parts["data"]) if parts["is_base64"] else parts["data"].encode() - mime_type = parts["mimetype"] - else: - async with httpx.AsyncClient() as client: - response = await client.get(doc.content.uri) - file_data = response.content - mime_type = doc.mime_type or response.headers.get("content-type", "application/octet-stream") - else: - content_str = await content_from_doc(doc) - file_data = content_str.encode("utf-8") - mime_type = doc.mime_type or "text/plain" + try: + try: + file_data, mime_type = await raw_data_from_doc(doc) + except Exception as e: + log.error(f"Failed to extract content from document {doc.document_id}: {e}") + continue - file_extension = mimetypes.guess_extension(mime_type) or ".txt" - filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}") + file_extension = mimetypes.guess_extension(mime_type) or ".txt" + filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}") - file_obj = io.BytesIO(file_data) - file_obj.name = filename + file_obj = io.BytesIO(file_data) + file_obj.name = filename - upload_file = UploadFile(file=file_obj, filename=filename) + upload_file = UploadFile(file=file_obj, filename=filename) - created_file = await self.files_api.openai_upload_file( - file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS - ) + try: + created_file = await self.files_api.openai_upload_file( + file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS + ) + except Exception as e: + log.error(f"Failed to upload file for document {doc.document_id}: {e}") + continue - chunking_strategy = VectorStoreChunkingStrategyStatic( - static=VectorStoreChunkingStrategyStaticConfig( - max_chunk_size_tokens=chunk_size_in_tokens, - chunk_overlap_tokens=chunk_size_in_tokens // 4, + chunking_strategy = VectorStoreChunkingStrategyStatic( + static=VectorStoreChunkingStrategyStaticConfig( + max_chunk_size_tokens=chunk_size_in_tokens, + chunk_overlap_tokens=chunk_size_in_tokens // 4, + ) ) - ) - await self.vector_io_api.openai_attach_file_to_vector_store( - vector_store_id=vector_db_id, - file_id=created_file.id, - attributes=doc.metadata, - chunking_strategy=chunking_strategy, - ) + try: + await self.vector_io_api.openai_attach_file_to_vector_store( + vector_store_id=vector_db_id, + file_id=created_file.id, + attributes=doc.metadata, + chunking_strategy=chunking_strategy, + ) + except Exception as e: + log.error( + f"Failed to attach file {created_file.id} to vector store {vector_db_id} for document {doc.document_id}: {e}" + ) + continue + + except Exception as e: + log.error(f"Unexpected error processing document {doc.document_id}: {e}") + continue async def query( self, @@ -274,7 +318,6 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti if query_config: query_config = TypeAdapter(RAGQueryConfig).validate_python(query_config) else: - # handle someone passing an empty dict query_config = RAGQueryConfig() query = kwargs["query"] @@ -285,6 +328,6 @@ class 
MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti ) return ToolInvocationResult( - content=result.content, + content=result.content or [], metadata=result.metadata, ) diff --git a/tests/integration/tool_runtime/test_rag_tool.py b/tests/integration/tool_runtime/test_rag_tool.py index b208500d8..b78c39af8 100644 --- a/tests/integration/tool_runtime/test_rag_tool.py +++ b/tests/integration/tool_runtime/test_rag_tool.py @@ -183,6 +183,110 @@ def test_vector_db_insert_from_url_and_query( assert any("llama2" in chunk.content.lower() for chunk in response2.chunks) +def test_rag_tool_openai_apis(client_with_empty_registry, embedding_model_id, embedding_dimension): + vector_db_id = "test_openai_vector_db" + + client_with_empty_registry.vector_dbs.register( + vector_db_id=vector_db_id, + embedding_model=embedding_model_id, + embedding_dimension=embedding_dimension, + ) + + available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()] + actual_vector_db_id = available_vector_dbs[0] + + # different document formats that should work with OpenAI APIs + documents = [ + Document( + document_id="text-doc", + content="This is a plain text document about machine learning algorithms.", + metadata={"type": "text", "category": "AI"}, + ), + Document( + document_id="url-doc", + content="https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/chat.rst", + mime_type="text/plain", + metadata={"type": "url", "source": "pytorch"}, + ), + Document( + document_id="data-url-doc", + content="data:text/plain;base64,VGhpcyBpcyBhIGRhdGEgVVJMIGRvY3VtZW50IGFib3V0IGRlZXAgbGVhcm5pbmcu", # "This is a data URL document about deep learning." + metadata={"type": "data_url", "encoding": "base64"}, + ), + ] + + client_with_empty_registry.tool_runtime.rag_tool.insert( + documents=documents, + vector_db_id=actual_vector_db_id, + chunk_size_in_tokens=256, + ) + + files_list = client_with_empty_registry.files.list() + assert len(files_list.data) >= len(documents), ( + f"Expected at least {len(documents)} files, got {len(files_list.data)}" + ) + + vector_store_files = client_with_empty_registry.vector_io.openai_list_files_in_vector_store( + vector_store_id=actual_vector_db_id + ) + assert len(vector_store_files.data) >= len(documents), f"Expected at least {len(documents)} files in vector store" + + response = client_with_empty_registry.tool_runtime.rag_tool.query( + vector_db_ids=[actual_vector_db_id], + content="Tell me about machine learning and deep learning", + ) + + assert_valid_text_response(response) + content_text = " ".join([chunk.text for chunk in response.content]).lower() + assert "machine learning" in content_text or "deep learning" in content_text + + +def test_rag_tool_exception_handling(client_with_empty_registry, embedding_model_id, embedding_dimension): + vector_db_id = "test_exception_handling" + + client_with_empty_registry.vector_dbs.register( + vector_db_id=vector_db_id, + embedding_model=embedding_model_id, + embedding_dimension=embedding_dimension, + ) + + available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()] + actual_vector_db_id = available_vector_dbs[0] + + documents = [ + Document( + document_id="valid-doc", + content="This is a valid document that should be processed successfully.", + metadata={"status": "valid"}, + ), + Document( + document_id="invalid-url-doc", + content="https://nonexistent-domain-12345.com/invalid.txt", + metadata={"status": "invalid_url"}, + 
), + Document( + document_id="another-valid-doc", + content="This is another valid document for testing resilience.", + metadata={"status": "valid"}, + ), + ] + + client_with_empty_registry.tool_runtime.rag_tool.insert( + documents=documents, + vector_db_id=actual_vector_db_id, + chunk_size_in_tokens=256, + ) + + response = client_with_empty_registry.tool_runtime.rag_tool.query( + vector_db_ids=[actual_vector_db_id], + content="valid document", + ) + + assert_valid_text_response(response) + content_text = " ".join([chunk.text for chunk in response.content]).lower() + assert "valid document" in content_text + + def test_rag_tool_insert_and_query(client_with_empty_registry, embedding_model_id, embedding_dimension): providers = [p for p in client_with_empty_registry.providers.list() if p.api == "vector_io"] assert len(providers) > 0 @@ -249,3 +353,107 @@ def test_rag_tool_insert_and_query(client_with_empty_registry, embedding_model_i "chunk_template": "This should raise a ValueError because it is missing the proper template variables", }, ) + + +def test_rag_tool_query_generation(client_with_empty_registry, embedding_model_id, embedding_dimension): + vector_db_id = "test_query_generation_db" + + client_with_empty_registry.vector_dbs.register( + vector_db_id=vector_db_id, + embedding_model=embedding_model_id, + embedding_dimension=embedding_dimension, + ) + + available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()] + actual_vector_db_id = available_vector_dbs[0] + + documents = [ + Document( + document_id="ai-doc", + content="Artificial intelligence and machine learning are transforming technology.", + metadata={"category": "AI"}, + ), + Document( + document_id="banana-doc", + content="Don't bring a banana to a knife fight.", + metadata={"category": "wisdom"}, + ), + ] + + client_with_empty_registry.tool_runtime.rag_tool.insert( + documents=documents, + vector_db_id=actual_vector_db_id, + chunk_size_in_tokens=256, + ) + + response = client_with_empty_registry.tool_runtime.rag_tool.query( + vector_db_ids=[actual_vector_db_id], + content="Tell me about AI", + ) + + assert_valid_text_response(response) + content_text = " ".join([chunk.text for chunk in response.content]).lower() + assert "artificial intelligence" in content_text or "machine learning" in content_text + + +def test_rag_tool_pdf_data_url_handling(client_with_empty_registry, embedding_model_id, embedding_dimension): + vector_db_id = "test_pdf_data_url_db" + + client_with_empty_registry.vector_dbs.register( + vector_db_id=vector_db_id, + embedding_model=embedding_model_id, + embedding_dimension=embedding_dimension, + ) + + available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()] + actual_vector_db_id = available_vector_dbs[0] + + sample_pdf = b"%PDF-1.3\n3 0 obj\n<>\nendobj\n4 0 obj\n<>\nstream\nx\x9c\x15\xcc1\x0e\x820\x18@\xe1\x9dS\xbcM]jk$\xd5\xd5(\x83!\x86\xa1\x17\xf8\xa3\xa5`LIh+\xd7W\xc6\xf7\r\xef\xc0\xbd\xd2\xaa\xb6,\xd5\xc5\xb1o\x0c\xa6VZ\xe3znn%\xf3o\xab\xb1\xe7\xa3:Y\xdc\x8bm\xeb\xf3&1\xc8\xd7\xd3\x97\xc82\xe6\x81\x87\xe42\xcb\x87Vb(\x12<\xdd<=}Jc\x0cL\x91\xee\xda$\xb5\xc3\xbd\xd7\xe9\x0f\x8d\x97 $\nendstream\nendobj\n1 0 obj\n<>\nendobj\n5 0 obj\n<>\nendobj\n2 0 obj\n<<\n/ProcSet [/PDF /Text /ImageB /ImageC /ImageI]\n/Font <<\n/F1 5 0 R\n>>\n/XObject <<\n>>\n>>\nendobj\n6 0 obj\n<<\n/Producer (PyFPDF 1.7.2 http://pyfpdf.googlecode.com/)\n/Title (This is a sample title.)\n/Author (Llama Stack Developers)\n/CreationDate 
(D:20250312165548)\n>>\nendobj\n7 0 obj\n<<\n/Type /Catalog\n/Pages 1 0 R\n/OpenAction [3 0 R /FitH null]\n/PageLayout /OneColumn\n>>\nendobj\nxref\n0 8\n0000000000 65535 f \n0000000272 00000 n \n0000000455 00000 n \n0000000009 00000 n \n0000000087 00000 n \n0000000359 00000 n \n0000000559 00000 n \n0000000734 00000 n \ntrailer\n<<\n/Size 8\n/Root 7 0 R\n/Info 6 0 R\n>>\nstartxref\n837\n%%EOF\n" + + import base64 + + pdf_base64 = base64.b64encode(sample_pdf).decode("utf-8") + pdf_data_url = f"data:application/pdf;base64,{pdf_base64}" + + documents = [ + Document( + document_id="test-pdf-data-url", + content=pdf_data_url, + metadata={"type": "pdf", "source": "data_url"}, + ), + ] + + client_with_empty_registry.tool_runtime.rag_tool.insert( + documents=documents, + vector_db_id=actual_vector_db_id, + chunk_size_in_tokens=256, + ) + + files_list = client_with_empty_registry.files.list() + assert len(files_list.data) >= 1, "PDF should have been uploaded to Files API" + + pdf_file = None + for file in files_list.data: + if file.filename and "test-pdf-data-url" in file.filename: + pdf_file = file + break + + assert pdf_file is not None, "PDF file should be found in Files API" + assert pdf_file.bytes == len(sample_pdf), f"File size should match original PDF ({len(sample_pdf)} bytes)" + + file_content = client_with_empty_registry.files.retrieve_content(pdf_file.id) + assert file_content.startswith(b"%PDF-"), "Retrieved file should be a valid PDF" + + vector_store_files = client_with_empty_registry.vector_io.openai_list_files_in_vector_store( + vector_store_id=actual_vector_db_id + ) + assert len(vector_store_files.data) >= 1, "PDF should be attached to vector store" + + response = client_with_empty_registry.tool_runtime.rag_tool.query( + vector_db_ids=[actual_vector_db_id], + content="sample title", + ) + + assert_valid_text_response(response) + content_text = " ".join([chunk.text for chunk in response.content]).lower() + assert "sample title" in content_text or "title" in content_text diff --git a/tests/unit/providers/utils/memory/test_vector_store.py b/tests/unit/providers/utils/memory/test_vector_store.py index 90b229262..590bdd1d2 100644 --- a/tests/unit/providers/utils/memory/test_vector_store.py +++ b/tests/unit/providers/utils/memory/test_vector_store.py @@ -178,3 +178,41 @@ def test_content_from_data_and_mime_type_both_encodings_fail(): # Should raise an exception instead of returning empty string with pytest.raises(UnicodeDecodeError): content_from_data_and_mime_type(data, mime_type) + + +async def test_memory_tool_error_handling(): + """Test that memory tool handles various failures gracefully without crashing.""" + from llama_stack.providers.inline.tool_runtime.rag.config import RagToolRuntimeConfig + from llama_stack.providers.inline.tool_runtime.rag.memory import MemoryToolRuntimeImpl + + config = RagToolRuntimeConfig() + memory_tool = MemoryToolRuntimeImpl( + config=config, + vector_io_api=AsyncMock(), + inference_api=AsyncMock(), + files_api=AsyncMock(), + ) + + docs = [ + RAGDocument(document_id="good_doc", content="Good content", metadata={}), + RAGDocument(document_id="bad_url_doc", content=URL(uri="https://bad.url"), metadata={}), + RAGDocument(document_id="another_good_doc", content="Another good content", metadata={}), + ] + + mock_file1 = MagicMock() + mock_file1.id = "file_good1" + mock_file2 = MagicMock() + mock_file2.id = "file_good2" + memory_tool.files_api.openai_upload_file.side_effect = [mock_file1, mock_file2] + + with patch("httpx.AsyncClient") as mock_client: + 
mock_instance = AsyncMock() + mock_instance.get.side_effect = Exception("Bad URL") + mock_client.return_value.__aenter__.return_value = mock_instance + + # won't raise exception despite one document failing + await memory_tool.insert(docs, "vector_store_123") + + # processed 2 documents successfully, skipped 1 + assert memory_tool.files_api.openai_upload_file.call_count == 2 + assert memory_tool.vector_io_api.openai_attach_file_to_vector_store.call_count == 2 From 8ef1189be7c6ea6e9fb2e3cf3f502123e0e4635a Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Thu, 11 Sep 2025 09:04:38 -0400 Subject: [PATCH 19/30] chore: update the vLLM inference impl to use OpenAIMixin for openai-compat functions (#3404) # What does this PR do? update vLLM inference provider to use OpenAIMixin for openai-compat functions inference recordings from Qwen3-0.6B and vLLM 0.8.3 - ``` docker run --gpus all -v ~/.cache/huggingface:/root/.cache/huggingface -p 8000:8000 --ipc=host \ vllm/vllm-openai:latest \ --model Qwen/Qwen3-0.6B --enable-auto-tool-choice --tool-call-parser hermes ``` ## Test Plan ``` ./scripts/integration-tests.sh --stack-config server:ci-tests --setup vllm --subdirs inference ``` --- .../providers/remote/inference/vllm/vllm.py | 197 +----------------- .../providers/utils/inference/openai_mixin.py | 28 ++- .../providers/inference/test_remote_vllm.py | 21 +- 3 files changed, 44 insertions(+), 202 deletions(-) diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 9e9a80ca5..77f5d82af 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
import json -from collections.abc import AsyncGenerator, AsyncIterator +from collections.abc import AsyncGenerator from typing import Any import httpx @@ -38,13 +38,6 @@ from llama_stack.apis.inference import ( LogProbConfig, Message, ModelStore, - OpenAIChatCompletion, - OpenAICompletion, - OpenAIEmbeddingData, - OpenAIEmbeddingsResponse, - OpenAIEmbeddingUsage, - OpenAIMessageParam, - OpenAIResponseFormatParam, ResponseFormat, SamplingParams, TextTruncation, @@ -71,11 +64,11 @@ from llama_stack.providers.utils.inference.openai_compat import ( convert_message_to_openai_dict, convert_tool_call, get_sampling_options, - prepare_openai_completion_params, process_chat_completion_stream_response, process_completion_response, process_completion_stream_response, ) +from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.prompt_adapter import ( completion_request_to_prompt, content_has_media, @@ -288,7 +281,7 @@ async def _process_vllm_chat_completion_stream_response( yield c -class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): +class VLLMInferenceAdapter(OpenAIMixin, Inference, ModelsProtocolPrivate): # automatically set by the resolver when instantiating the provider __provider_id__: str model_store: ModelStore | None = None @@ -296,7 +289,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): def __init__(self, config: VLLMInferenceAdapterConfig) -> None: self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries()) self.config = config - self.client = None async def initialize(self) -> None: if not self.config.url: @@ -308,8 +300,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): return self.config.refresh_models async def list_models(self) -> list[Model] | None: - self._lazy_initialize_client() - assert self.client is not None # mypy models = [] async for m in self.client.models.list(): model_type = ModelType.llm # unclear how to determine embedding vs. llm models @@ -340,8 +330,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): HealthResponse: A dictionary containing the health status. 
""" try: - client = self._create_client() if self.client is None else self.client - _ = [m async for m in client.models.list()] # Ensure the client is initialized + _ = [m async for m in self.client.models.list()] # Ensure the client is initialized return HealthResponse(status=HealthStatus.OK) except Exception as e: return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}") @@ -351,19 +340,14 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): raise ValueError("Model store not set") return await self.model_store.get_model(model_id) - def _lazy_initialize_client(self): - if self.client is not None: - return + def get_api_key(self): + return self.config.api_token - log.info(f"Initializing vLLM client with base_url={self.config.url}") - self.client = self._create_client() + def get_base_url(self): + return self.config.url - def _create_client(self): - return AsyncOpenAI( - base_url=self.config.url, - api_key=self.config.api_token, - http_client=httpx.AsyncClient(verify=self.config.tls_verify), - ) + def get_extra_client_params(self): + return {"http_client": httpx.AsyncClient(verify=self.config.tls_verify)} async def completion( self, @@ -374,7 +358,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): stream: bool | None = False, logprobs: LogProbConfig | None = None, ) -> CompletionResponse | AsyncGenerator[CompletionResponseStreamChunk, None]: - self._lazy_initialize_client() if sampling_params is None: sampling_params = SamplingParams() model = await self._get_model(model_id) @@ -406,7 +389,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): logprobs: LogProbConfig | None = None, tool_config: ToolConfig | None = None, ) -> ChatCompletionResponse | AsyncGenerator[ChatCompletionResponseStreamChunk, None]: - self._lazy_initialize_client() if sampling_params is None: sampling_params = SamplingParams() model = await self._get_model(model_id) @@ -479,16 +461,12 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): yield chunk async def register_model(self, model: Model) -> Model: - # register_model is called during Llama Stack initialization, hence we cannot init self.client if not initialized yet. - # self.client should only be created after the initialization is complete to avoid asyncio cross-context errors. - # Changing this may lead to unpredictable behavior. - client = self._create_client() if self.client is None else self.client try: model = await self.register_helper.register_model(model) except ValueError: pass # Ignore statically unknown model, will check live listing try: - res = await client.models.list() + res = await self.client.models.list() except APIConnectionError as e: raise ValueError( f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL." 
@@ -543,8 +521,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): output_dimension: int | None = None, task_type: EmbeddingTaskType | None = None, ) -> EmbeddingsResponse: - self._lazy_initialize_client() - assert self.client is not None model = await self._get_model(model_id) kwargs = {} @@ -560,154 +536,3 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): embeddings = [data.embedding for data in response.data] return EmbeddingsResponse(embeddings=embeddings) - - async def openai_embeddings( - self, - model: str, - input: str | list[str], - encoding_format: str | None = "float", - dimensions: int | None = None, - user: str | None = None, - ) -> OpenAIEmbeddingsResponse: - self._lazy_initialize_client() - assert self.client is not None - model_obj = await self._get_model(model) - assert model_obj.model_type == ModelType.embedding - - # Convert input to list if it's a string - input_list = [input] if isinstance(input, str) else input - - # Call vLLM embeddings endpoint with encoding_format - response = await self.client.embeddings.create( - model=model_obj.provider_resource_id, - input=input_list, - dimensions=dimensions, - encoding_format=encoding_format, - ) - - # Convert response to OpenAI format - data = [ - OpenAIEmbeddingData( - embedding=embedding_data.embedding, - index=i, - ) - for i, embedding_data in enumerate(response.data) - ] - - # Not returning actual token usage since vLLM doesn't provide it - usage = OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1) - - return OpenAIEmbeddingsResponse( - data=data, - model=model_obj.provider_resource_id, - usage=usage, - ) - - async def openai_completion( - self, - model: str, - prompt: str | list[str] | list[int] | list[list[int]], - best_of: int | None = None, - echo: bool | None = None, - frequency_penalty: float | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_tokens: int | None = None, - n: int | None = None, - presence_penalty: float | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - top_p: float | None = None, - user: str | None = None, - guided_choice: list[str] | None = None, - prompt_logprobs: int | None = None, - suffix: str | None = None, - ) -> OpenAICompletion: - self._lazy_initialize_client() - model_obj = await self._get_model(model) - - extra_body: dict[str, Any] = {} - if prompt_logprobs is not None and prompt_logprobs >= 0: - extra_body["prompt_logprobs"] = prompt_logprobs - if guided_choice: - extra_body["guided_choice"] = guided_choice - - params = await prepare_openai_completion_params( - model=model_obj.provider_resource_id, - prompt=prompt, - best_of=best_of, - echo=echo, - frequency_penalty=frequency_penalty, - logit_bias=logit_bias, - logprobs=logprobs, - max_tokens=max_tokens, - n=n, - presence_penalty=presence_penalty, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - top_p=top_p, - user=user, - extra_body=extra_body, - ) - return await self.client.completions.create(**params) # type: ignore - - async def openai_chat_completion( - self, - model: str, - messages: list[OpenAIMessageParam], - frequency_penalty: float | None = None, - function_call: str | dict[str, Any] | None = None, - functions: list[dict[str, Any]] | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_completion_tokens: 
int | None = None, - max_tokens: int | None = None, - n: int | None = None, - parallel_tool_calls: bool | None = None, - presence_penalty: float | None = None, - response_format: OpenAIResponseFormatParam | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - tool_choice: str | dict[str, Any] | None = None, - tools: list[dict[str, Any]] | None = None, - top_logprobs: int | None = None, - top_p: float | None = None, - user: str | None = None, - ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: - self._lazy_initialize_client() - model_obj = await self._get_model(model) - params = await prepare_openai_completion_params( - model=model_obj.provider_resource_id, - messages=messages, - frequency_penalty=frequency_penalty, - function_call=function_call, - functions=functions, - logit_bias=logit_bias, - logprobs=logprobs, - max_completion_tokens=max_completion_tokens, - max_tokens=max_tokens, - n=n, - parallel_tool_calls=parallel_tool_calls, - presence_penalty=presence_penalty, - response_format=response_format, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - tool_choice=tool_choice, - tools=tools, - top_logprobs=top_logprobs, - top_p=top_p, - user=user, - ) - return await self.client.chat.completions.create(**params) # type: ignore diff --git a/llama_stack/providers/utils/inference/openai_mixin.py b/llama_stack/providers/utils/inference/openai_mixin.py index f60deee6e..a3c0ffadc 100644 --- a/llama_stack/providers/utils/inference/openai_mixin.py +++ b/llama_stack/providers/utils/inference/openai_mixin.py @@ -67,6 +67,17 @@ class OpenAIMixin(ABC): """ pass + def get_extra_client_params(self) -> dict[str, Any]: + """ + Get any extra parameters to pass to the AsyncOpenAI client. + + Child classes can override this method to provide additional parameters + such as timeout settings, proxies, etc. + + :return: A dictionary of extra parameters + """ + return {} + @property def client(self) -> AsyncOpenAI: """ @@ -78,6 +89,7 @@ class OpenAIMixin(ABC): return AsyncOpenAI( api_key=self.get_api_key(), base_url=self.get_base_url(), + **self.get_extra_client_params(), ) async def _get_provider_model_id(self, model: str) -> str: @@ -124,10 +136,15 @@ class OpenAIMixin(ABC): """ Direct OpenAI completion API call. """ - if guided_choice is not None: - logger.warning("guided_choice is not supported by the OpenAI API. Ignoring.") - if prompt_logprobs is not None: - logger.warning("prompt_logprobs is not supported by the OpenAI API. 
Ignoring.") + # Handle parameters that are not supported by OpenAI API, but may be by the provider + # prompt_logprobs is supported by vLLM + # guided_choice is supported by vLLM + # TODO: test coverage + extra_body: dict[str, Any] = {} + if prompt_logprobs is not None and prompt_logprobs >= 0: + extra_body["prompt_logprobs"] = prompt_logprobs + if guided_choice: + extra_body["guided_choice"] = guided_choice # TODO: fix openai_completion to return type compatible with OpenAI's API response return await self.client.completions.create( # type: ignore[no-any-return] @@ -150,7 +167,8 @@ class OpenAIMixin(ABC): top_p=top_p, user=user, suffix=suffix, - ) + ), + extra_body=extra_body, ) async def openai_chat_completion( diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py index ce0e930b1..a48af2a1d 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -11,7 +11,7 @@ import threading import time from http.server import BaseHTTPRequestHandler, HTTPServer from typing import Any -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import AsyncMock, MagicMock, PropertyMock, patch import pytest from openai.types.chat.chat_completion_chunk import ( @@ -150,10 +150,12 @@ async def test_tool_call_response(vllm_inference_adapter): """Verify that tool call arguments from a CompletionMessage are correctly converted into the expected JSON format.""" - # Patch the call to vllm so we can inspect the arguments sent were correct - with patch.object( - vllm_inference_adapter.client.chat.completions, "create", new_callable=AsyncMock - ) as mock_nonstream_completion: + # Patch the client property to avoid instantiating a real AsyncOpenAI client + with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client: + mock_client = MagicMock() + mock_client.chat.completions.create = AsyncMock() + mock_create_client.return_value = mock_client + messages = [ SystemMessage(content="You are a helpful assistant"), UserMessage(content="How many?"), @@ -179,7 +181,7 @@ async def test_tool_call_response(vllm_inference_adapter): tool_config=ToolConfig(tool_choice=ToolChoice.auto), ) - assert mock_nonstream_completion.call_args.kwargs["messages"][2]["tool_calls"] == [ + assert mock_client.chat.completions.create.call_args.kwargs["messages"][2]["tool_calls"] == [ { "id": "foo", "type": "function", @@ -641,9 +643,7 @@ async def test_health_status_success(vllm_inference_adapter): This test verifies that the health method returns a HealthResponse with status OK, only when the connection to the vLLM server is successful. """ - # Set vllm_inference_adapter.client to None to ensure _create_client is called - vllm_inference_adapter.client = None - with patch.object(vllm_inference_adapter, "_create_client") as mock_create_client: + with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client: # Create mock client and models mock_client = MagicMock() mock_models = MagicMock() @@ -674,8 +674,7 @@ async def test_health_status_failure(vllm_inference_adapter): This test verifies that the health method returns a HealthResponse with status ERROR and an appropriate error message when the connection to the vLLM server fails. 
""" - vllm_inference_adapter.client = None - with patch.object(vllm_inference_adapter, "_create_client") as mock_create_client: + with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client: # Create mock client and models mock_client = MagicMock() mock_models = MagicMock() From 72387b4bd229bba60b43f95679da62630fc0f3c7 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Thu, 11 Sep 2025 11:45:16 -0400 Subject: [PATCH 20/30] chore(unit tests): remove network use, update async test (#3418) # What does this PR do? update the async detection test for vllm - remove a network access from unit tests - remove direct logging use the idea behind the test is to mock inference w/ a sleep, initiate concurrent inference calls, verify the total execution time is close to the sleep time. in a non-async env the total time would be closer to sleep * num concurrent calls. ## Test Plan ci --- .../providers/inference/test_remote_vllm.py | 160 +++++++----------- 1 file changed, 60 insertions(+), 100 deletions(-) diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py index a48af2a1d..61b16b5d1 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -6,11 +6,7 @@ import asyncio import json -import logging # allow-direct-logging -import threading import time -from http.server import BaseHTTPRequestHandler, HTTPServer -from typing import Any from unittest.mock import AsyncMock, MagicMock, PropertyMock, patch import pytest @@ -18,7 +14,7 @@ from openai.types.chat.chat_completion_chunk import ( ChatCompletionChunk as OpenAIChatCompletionChunk, ) from openai.types.chat.chat_completion_chunk import ( - Choice as OpenAIChoice, + Choice as OpenAIChoiceChunk, ) from openai.types.chat.chat_completion_chunk import ( ChoiceDelta as OpenAIChoiceDelta, @@ -35,6 +31,9 @@ from llama_stack.apis.inference import ( ChatCompletionRequest, ChatCompletionResponseEventType, CompletionMessage, + OpenAIAssistantMessageParam, + OpenAIChatCompletion, + OpenAIChoice, SystemMessage, ToolChoice, ToolConfig, @@ -61,41 +60,6 @@ from llama_stack.providers.remote.inference.vllm.vllm import ( # -v -s --tb=short --disable-warnings -class MockInferenceAdapterWithSleep: - def __init__(self, sleep_time: int, response: dict[str, Any]): - self.httpd = None - - class DelayedRequestHandler(BaseHTTPRequestHandler): - # ruff: noqa: N802 - def do_POST(self): - time.sleep(sleep_time) - response_body = json.dumps(response).encode("utf-8") - self.send_response(code=200) - self.send_header("Content-Type", "application/json") - self.send_header("Content-Length", len(response_body)) - self.end_headers() - self.wfile.write(response_body) - - self.request_handler = DelayedRequestHandler - - def __enter__(self): - httpd = HTTPServer(("", 0), self.request_handler) - self.httpd = httpd - host, port = httpd.server_address - httpd_thread = threading.Thread(target=httpd.serve_forever) - httpd_thread.daemon = True # stop server if this thread terminates - httpd_thread.start() - - config = VLLMInferenceAdapterConfig(url=f"http://{host}:{port}") - inference_adapter = VLLMInferenceAdapter(config) - return inference_adapter - - def __exit__(self, _exc_type, _exc_value, _traceback): - if self.httpd: - self.httpd.shutdown() - self.httpd.server_close() - - @pytest.fixture(scope="module") def mock_openai_models_list(): with patch("openai.resources.models.AsyncModels.list", new_callable=AsyncMock) as mock_list: 
@@ -201,7 +165,7 @@ async def test_tool_call_delta_empty_tool_call_buf(): async def mock_stream(): delta = OpenAIChoiceDelta(content="", tool_calls=None) - choices = [OpenAIChoice(delta=delta, finish_reason="stop", index=0)] + choices = [OpenAIChoiceChunk(delta=delta, finish_reason="stop", index=0)] mock_chunk = OpenAIChatCompletionChunk( id="chunk-1", created=1, @@ -227,7 +191,7 @@ async def test_tool_call_delta_streaming_arguments_dict(): model="foo", object="chat.completion.chunk", choices=[ - OpenAIChoice( + OpenAIChoiceChunk( delta=OpenAIChoiceDelta( content="", tool_calls=[ @@ -252,7 +216,7 @@ async def test_tool_call_delta_streaming_arguments_dict(): model="foo", object="chat.completion.chunk", choices=[ - OpenAIChoice( + OpenAIChoiceChunk( delta=OpenAIChoiceDelta( content="", tool_calls=[ @@ -277,7 +241,9 @@ async def test_tool_call_delta_streaming_arguments_dict(): model="foo", object="chat.completion.chunk", choices=[ - OpenAIChoice(delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0) + OpenAIChoiceChunk( + delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0 + ) ], ) for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]: @@ -301,7 +267,7 @@ async def test_multiple_tool_calls(): model="foo", object="chat.completion.chunk", choices=[ - OpenAIChoice( + OpenAIChoiceChunk( delta=OpenAIChoiceDelta( content="", tool_calls=[ @@ -326,7 +292,7 @@ async def test_multiple_tool_calls(): model="foo", object="chat.completion.chunk", choices=[ - OpenAIChoice( + OpenAIChoiceChunk( delta=OpenAIChoiceDelta( content="", tool_calls=[ @@ -351,7 +317,9 @@ async def test_multiple_tool_calls(): model="foo", object="chat.completion.chunk", choices=[ - OpenAIChoice(delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0) + OpenAIChoiceChunk( + delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0 + ) ], ) for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]: @@ -395,59 +363,6 @@ async def test_process_vllm_chat_completion_stream_response_no_choices(): assert chunks[0].event.event_type.value == "start" -@pytest.mark.allow_network -def test_chat_completion_doesnt_block_event_loop(caplog): - loop = asyncio.new_event_loop() - loop.set_debug(True) - caplog.set_level(logging.WARNING) - - # Log when event loop is blocked for more than 200ms - loop.slow_callback_duration = 0.5 - # Sleep for 500ms in our delayed http response - sleep_time = 0.5 - - mock_model = Model(identifier="mock-model", provider_resource_id="mock-model", provider_id="vllm-inference") - mock_response = { - "id": "chatcmpl-abc123", - "object": "chat.completion", - "created": 1, - "modle": "mock-model", - "choices": [ - { - "message": {"content": ""}, - "logprobs": None, - "finish_reason": "stop", - "index": 0, - } - ], - } - - async def do_chat_completion(): - await inference_adapter.chat_completion( - "mock-model", - [], - stream=False, - tools=None, - tool_config=ToolConfig(tool_choice=ToolChoice.auto), - ) - - with MockInferenceAdapterWithSleep(sleep_time, mock_response) as inference_adapter: - inference_adapter.model_store = AsyncMock() - inference_adapter.model_store.get_model.return_value = mock_model - loop.run_until_complete(inference_adapter.initialize()) - - # Clear the logs so far and run the actual chat completion we care about - caplog.clear() - loop.run_until_complete(do_chat_completion()) - - # Ensure we don't have any asyncio warnings in the captured log - # records from our chat 
completion call. A message gets logged - # here any time we exceed the slow_callback_duration configured - # above. - asyncio_warnings = [record.message for record in caplog.records if record.name == "asyncio"] - assert not asyncio_warnings - - async def test_get_params_empty_tools(vllm_inference_adapter): request = ChatCompletionRequest( tools=[], @@ -696,3 +611,48 @@ async def test_health_status_failure(vllm_inference_adapter): assert "Health check failed: Connection failed" in health_response["message"] mock_models.list.assert_called_once() + + +async def test_openai_chat_completion_is_async(vllm_inference_adapter): + """ + Verify that openai_chat_completion is async and doesn't block the event loop. + + To do this we mock the underlying inference with a sleep, start multiple + inference calls in parallel, and ensure the total time taken is less + than the sum of the individual sleep times. + """ + sleep_time = 0.5 + + async def mock_create(*args, **kwargs): + await asyncio.sleep(sleep_time) + return OpenAIChatCompletion( + id="chatcmpl-abc123", + created=1, + model="mock-model", + choices=[ + OpenAIChoice( + message=OpenAIAssistantMessageParam( + content="nothing interesting", + ), + finish_reason="stop", + index=0, + ) + ], + ) + + async def do_inference(): + await vllm_inference_adapter.openai_chat_completion( + "mock-model", messages=["one fish", "two fish"], stream=False + ) + + with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client: + mock_client = MagicMock() + mock_client.chat.completions.create = AsyncMock(side_effect=mock_create) + mock_create_client.return_value = mock_client + + start_time = time.time() + await asyncio.gather(do_inference(), do_inference(), do_inference(), do_inference()) + total_time = time.time() - start_time + + assert mock_create_client.call_count == 4 # no cheating + assert total_time < (sleep_time * 2), f"Total time taken: {total_time}s exceeded expected max" From c7ef1f13df981622216833578c70d98f702d9cc6 Mon Sep 17 00:00:00 2001 From: slekkala1 Date: Thu, 11 Sep 2025 11:10:41 -0700 Subject: [PATCH 21/30] feat: Add langchain llamastack Integration example notebook (#3314) # What does this PR do? The notebook was reverted(https://github.com/llamastack/llama-stack/pull/3259) as it had some local paths, I missed correcting. Trying with corrections now ## Test Plan Ran the Jupyter notebook --- .../langchain/Llama_Stack_LangChain.ipynb | 701 ++++++++++++++++++ 1 file changed, 701 insertions(+) create mode 100644 docs/notebooks/langchain/Llama_Stack_LangChain.ipynb diff --git a/docs/notebooks/langchain/Llama_Stack_LangChain.ipynb b/docs/notebooks/langchain/Llama_Stack_LangChain.ipynb new file mode 100644 index 000000000..d44ac6994 --- /dev/null +++ b/docs/notebooks/langchain/Llama_Stack_LangChain.ipynb @@ -0,0 +1,701 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1ztegmwm4sp", + "metadata": {}, + "source": [ + "## LlamaStack + LangChain Integration Tutorial\n", + "\n", + "This notebook demonstrates how to integrate **LlamaStack** with **LangChain** to build a complete RAG (Retrieval-Augmented Generation) system.\n", + "\n", + "### Overview\n", + "\n", + "- **LlamaStack**: Provides the infrastructure for running LLMs and Open AI Compatible Vector Stores\n", + "- **LangChain**: Provides the framework for chaining operations and prompt templates\n", + "- **Integration**: Uses LlamaStack's OpenAI-compatible API with LangChain\n", + "\n", + "### What You'll See\n", + "\n", + "1. 
Setting up LlamaStack server with Fireworks AI provider\n", + "2. Creating and Querying Vector Stores\n", + "3. Building RAG chains with LangChain + LLAMAStack\n", + "4. Querying the chain for relevant information\n", + "\n", + "### Prerequisites\n", + "\n", + "- Fireworks API key\n", + "\n", + "---\n", + "\n", + "### 1. Installation and Setup" + ] + }, + { + "cell_type": "markdown", + "id": "2ktr5ls2cas", + "metadata": {}, + "source": [ + "#### Install Required Dependencies\n", + "\n", + "First, we install all the necessary packages for LangChain and FastAPI integration." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5b6a6a17-b931-4bea-8273-0d6e5563637a", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: uv in /Users/swapna942/miniconda3/lib/python3.12/site-packages (0.7.20)\n", + "\u001b[2mUsing Python 3.12.11 environment at: /Users/swapna942/miniconda3\u001b[0m\n", + "\u001b[2mAudited \u001b[1m7 packages\u001b[0m \u001b[2min 42ms\u001b[0m\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install uv\n", + "!uv pip install fastapi uvicorn \"langchain>=0.2\" langchain-openai \\\n", + " langchain-community langchain-text-splitters \\\n", + " faiss-cpu" + ] + }, + { + "cell_type": "markdown", + "id": "wmt9jvqzh7n", + "metadata": {}, + "source": [ + "### 2. LlamaStack Server Setup\n", + "\n", + "#### Build and Start LlamaStack Server\n", + "\n", + "This section sets up the LlamaStack server with:\n", + "- **Fireworks AI** as the inference provider\n", + "- **Sentence Transformers** for embeddings\n", + "\n", + "The server runs on `localhost:8321` and provides OpenAI-compatible endpoints." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dd2dacf3-ec8b-4cc7-8ff4-b5b6ea4a6e9e", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import os\n", + "import subprocess\n", + "import time\n", + "\n", + "# Remove UV_SYSTEM_PYTHON to ensure uv creates a proper virtual environment\n", + "# instead of trying to use system Python globally, which could cause permission issues\n", + "# and package conflicts with the system's Python installation\n", + "if \"UV_SYSTEM_PYTHON\" in os.environ:\n", + " del os.environ[\"UV_SYSTEM_PYTHON\"]\n", + "\n", + "def run_llama_stack_server_background():\n", + " \"\"\"Build and run LlamaStack server in one step using --run flag\"\"\"\n", + " log_file = open(\"llama_stack_server.log\", \"w\")\n", + " process = subprocess.Popen(\n", + " \"uv run --with llama-stack llama stack build --distro starter --image-type venv --run\",\n", + " shell=True,\n", + " stdout=log_file,\n", + " stderr=log_file,\n", + " text=True,\n", + " )\n", + "\n", + " print(f\"Building and starting Llama Stack server with PID: {process.pid}\")\n", + " return process\n", + "\n", + "\n", + "def wait_for_server_to_start():\n", + " import requests\n", + " from requests.exceptions import ConnectionError\n", + "\n", + " url = \"http://0.0.0.0:8321/v1/health\"\n", + " max_retries = 30\n", + " retry_interval = 1\n", + "\n", + " print(\"Waiting for server to start\", end=\"\")\n", + " for _ in range(max_retries):\n", + " try:\n", + " response = requests.get(url)\n", + " if response.status_code == 200:\n", + " print(\"\\nServer is ready!\")\n", + " return True\n", + " except ConnectionError:\n", + " print(\".\", end=\"\", flush=True)\n", + " time.sleep(retry_interval)\n", + "\n", + " print(\"\\nServer failed to start after\", max_retries * retry_interval, 
\"seconds\")\n", + " return False\n", + "\n", + "\n", + "def kill_llama_stack_server():\n", + " # Kill any existing llama stack server processes using pkill command\n", + " os.system(\"pkill -f llama_stack.core.server.server\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "28bd8dbd-4576-4e76-813f-21ab94db44a2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Building and starting Llama Stack server with PID: 19747\n", + "Waiting for server to start....\n", + "Server is ready!\n" + ] + } + ], + "source": [ + "server_process = run_llama_stack_server_background()\n", + "assert wait_for_server_to_start()" + ] + }, + { + "cell_type": "markdown", + "id": "gr9cdcg4r7n", + "metadata": {}, + "source": [ + "#### Install LlamaStack Client\n", + "\n", + "Install the client library to interact with the LlamaStack server." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "487d2dbc-d071-400e-b4f0-dcee58f8dc95", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2mUsing Python 3.12.11 environment at: /Users/swapna942/miniconda3\u001b[0m\n", + "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 27ms\u001b[0m\u001b[0m\n" + ] + } + ], + "source": [ + "!uv pip install llama_stack_client" + ] + }, + { + "cell_type": "markdown", + "id": "0j5hag7l9x89", + "metadata": {}, + "source": [ + "### 3. Initialize LlamaStack Client\n", + "\n", + "Create a client connection to the LlamaStack server with API keys for different providers:\n", + "\n", + "- **Fireworks API Key**: For Fireworks models\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ab4eff97-4565-4c73-b1b3-0020a4c7e2a5", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client import LlamaStackClient\n", + "\n", + "client = LlamaStackClient(\n", + " base_url=\"http://0.0.0.0:8321\",\n", + " provider_data={\"fireworks_api_key\": \"***\"},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "vwhexjy1e8o", + "metadata": {}, + "source": [ + "#### Explore Available Models and Safety Features\n", + "\n", + "Check what models and safety shields are available through your LlamaStack instance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "880443ef-ac3c-48b1-a80a-7dab5b25ac61", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: GET http://0.0.0.0:8321/v1/models \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: GET http://0.0.0.0:8321/v1/shields \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Available Fireworks models:\n", + "- fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct\n", + "- fireworks/accounts/fireworks/models/llama-v3p1-70b-instruct\n", + "- fireworks/accounts/fireworks/models/llama-v3p1-405b-instruct\n", + "- fireworks/accounts/fireworks/models/llama-v3p2-3b-instruct\n", + "- fireworks/accounts/fireworks/models/llama-v3p2-11b-vision-instruct\n", + "- fireworks/accounts/fireworks/models/llama-v3p2-90b-vision-instruct\n", + "- fireworks/accounts/fireworks/models/llama-v3p3-70b-instruct\n", + "- fireworks/accounts/fireworks/models/llama4-scout-instruct-basic\n", + "- fireworks/accounts/fireworks/models/llama4-maverick-instruct-basic\n", + "- fireworks/nomic-ai/nomic-embed-text-v1.5\n", + "- fireworks/accounts/fireworks/models/llama-guard-3-8b\n", + "- fireworks/accounts/fireworks/models/llama-guard-3-11b-vision\n", + "----\n", + "Available shields (safety models):\n", + "code-scanner\n", + "llama-guard\n", + "nemo-guardrail\n", + "----\n" + ] + } + ], + "source": [ + "print(\"Available Fireworks models:\")\n", + "for m in client.models.list():\n", + " if m.identifier.startswith(\"fireworks/\"):\n", + " print(f\"- {m.identifier}\")\n", + "\n", + "print(\"----\")\n", + "print(\"Available shields (safety models):\")\n", + "for s in client.shields.list():\n", + " print(s.identifier)\n", + "print(\"----\")" + ] + }, + { + "cell_type": "markdown", + "id": "gojp7at31ht", + "metadata": {}, + "source": [ + "### 4. 
Vector Store Setup\n", + "\n", + "#### Create a Vector Store with File Upload\n", + "\n", + "Create a vector store using the OpenAI-compatible vector stores API:\n", + "\n", + "- **Vector Store**: OpenAI-compatible vector store for document storage\n", + "- **File Upload**: Automatic chunking and embedding of uploaded files \n", + "- **Embedding Model**: Sentence Transformers model for text embeddings\n", + "- **Dimensions**: 384-dimensional embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "be2c2899-ea53-4e5f-b6b8-ed425f5d6572", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/files \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/files \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/files \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File(id='file-54652c95c56c4c34918a97d7ff8a4320', bytes=41, created_at=1757442621, expires_at=1788978621, filename='shipping_policy.txt', object='file', purpose='assistants')\n", + "File(id='file-fb1227c1d1854da1bd774d21e5b7e41c', bytes=48, created_at=1757442621, expires_at=1788978621, filename='returns_policy.txt', object='file', purpose='assistants')\n", + "File(id='file-673f874852fe42798675a13d06a256e2', bytes=45, created_at=1757442621, expires_at=1788978621, filename='support.txt', object='file', purpose='assistants')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores \"HTTP/1.1 200 OK\"\n" + ] + } + ], + "source": [ + "from io import BytesIO\n", + "\n", + "docs = [\n", + " (\"Acme ships globally in 3-5 business days.\", {\"title\": \"Shipping Policy\"}),\n", + " (\"Returns are accepted within 30 days of purchase.\", {\"title\": \"Returns Policy\"}),\n", + " (\"Support is available 24/7 via chat and email.\", {\"title\": \"Support\"}),\n", + "]\n", + "\n", + "file_ids = []\n", + "for content, metadata in docs:\n", + " with BytesIO(content.encode()) as file_buffer:\n", + " file_buffer.name = f\"{metadata['title'].replace(' ', '_').lower()}.txt\"\n", + " create_file_response = client.files.create(file=file_buffer, purpose=\"assistants\")\n", + " print(create_file_response)\n", + " file_ids.append(create_file_response.id)\n", + "\n", + "# Create vector store with files\n", + "vector_store = client.vector_stores.create(\n", + " name=\"acme_docs\",\n", + " file_ids=file_ids,\n", + " embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\",\n", + " embedding_dimension=384,\n", + " provider_id=\"faiss\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9061tmi1zpq", + "metadata": {}, + "source": [ + "#### Test Vector Store Search\n", + "\n", + "Query the vector store. This performs semantic search to find relevant documents based on the query." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ba9d1901-bd5e-4216-b3e6-19dc74551cc6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores/vs_708c060b-45da-423e-8354-68529b4fd1a6/search \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Acme ships globally in 3-5 business days.\n", + "Returns are accepted within 30 days of purchase.\n" + ] + } + ], + "source": [ + "search_response = client.vector_stores.search(\n", + " vector_store_id=vector_store.id,\n", + " query=\"How long does shipping take?\",\n", + " max_num_results=2\n", + ")\n", + "for result in search_response.data:\n", + " content = result.content[0].text\n", + " print(content)" + ] + }, + { + "cell_type": "markdown", + "id": "usne6mbspms", + "metadata": {}, + "source": [ + "### 5. LangChain Integration\n", + "\n", + "#### Configure LangChain with LlamaStack\n", + "\n", + "Set up LangChain to use LlamaStack's OpenAI-compatible API:\n", + "\n", + "- **Base URL**: Points to LlamaStack's OpenAI endpoint\n", + "- **Headers**: Include Fireworks API key for model access\n", + "- **Model**: Use Meta Llama v3p1 8b instruct model for inference" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c378bd10-09c2-417c-bdfc-1e0a2dd19084", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "# Point LangChain to Llamastack Server\n", + "llm = ChatOpenAI(\n", + " base_url=\"http://0.0.0.0:8321/v1/openai/v1\",\n", + " api_key=\"dummy\",\n", + " model=\"fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct\",\n", + " default_headers={\"X-LlamaStack-Provider-Data\": '{\"fireworks_api_key\": \"***\"}'},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5a4ddpcuk3l", + "metadata": {}, + "source": [ + "#### Test LLM Connection\n", + "\n", + "Verify that LangChain can successfully communicate with the LlamaStack server." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f88ffb5a-657b-4916-9375-c6ddc156c25e", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "data": { + "text/plain": [ + "AIMessage(content=\"A llama's gentle eyes shine bright,\\nIn the Andes, it roams through morning light.\", additional_kwargs={'refusal': None}, response_metadata={'token_usage': None, 'model_name': 'fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct', 'system_fingerprint': None, 'id': 'chatcmpl-602b5967-82a3-476b-9cd2-7d3b29b76ee8', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None}, id='run--0933c465-ff4d-4a7b-b7fb-fd97dd8244f3-0')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Test llm with simple message\n", + "messages = [\n", + " {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n", + " {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n", + "]\n", + "llm.invoke(messages)" + ] + }, + { + "cell_type": "markdown", + "id": "0xh0jg6a0l4a", + "metadata": {}, + "source": [ + "### 6. Building the RAG Chain\n", + "\n", + "#### Create a Complete RAG Pipeline\n", + "\n", + "Build a LangChain pipeline that combines:\n", + "\n", + "1. 
**Vector Search**: Query LlamaStack's Open AI compatible Vector Store\n", + "2. **Context Assembly**: Format retrieved documents\n", + "3. **Prompt Template**: Structure the input for the LLM\n", + "4. **LLM Generation**: Generate answers using context\n", + "5. **Output Parsing**: Extract the final response\n", + "\n", + "**Chain Flow**: `Query → Vector Search → Context + Question → LLM → Response`" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9684427d-dcc7-4544-9af5-8b110d014c42", + "metadata": {}, + "outputs": [], + "source": [ + "# LangChain for prompt template and chaining + LLAMA Stack Client Vector DB and LLM chat completion\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_core.runnables import RunnableLambda, RunnablePassthrough\n", + "\n", + "\n", + "def join_docs(docs):\n", + " return \"\\n\\n\".join([f\"[{d.filename}] {d.content[0].text}\" for d in docs.data])\n", + "\n", + "PROMPT = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", \"You are a helpful assistant. Use the following context to answer.\"),\n", + " (\"user\", \"Question: {question}\\n\\nContext:\\n{context}\"),\n", + " ]\n", + ")\n", + "\n", + "vector_step = RunnableLambda(\n", + " lambda x: client.vector_stores.search(\n", + " vector_store_id=vector_store.id,\n", + " query=x,\n", + " max_num_results=2\n", + " )\n", + " )\n", + "\n", + "chain = (\n", + " {\"context\": vector_step | RunnableLambda(join_docs), \"question\": RunnablePassthrough()}\n", + " | PROMPT\n", + " | llm\n", + " | StrOutputParser()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0onu6rhphlra", + "metadata": {}, + "source": [ + "### 7. Testing the RAG System\n", + "\n", + "#### Example 1: Shipping Query" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "03322188-9509-446a-a4a8-ce3bb83ec87c", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores/vs_708c060b-45da-423e-8354-68529b4fd1a6/search \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "❓ How long does shipping take?\n", + "💡 Acme ships globally in 3-5 business days. This means that shipping typically takes between 3 to 5 working days from the date of dispatch or order fulfillment.\n" + ] + } + ], + "source": [ + "query = \"How long does shipping take?\"\n", + "response = chain.invoke(query)\n", + "print(\"❓\", query)\n", + "print(\"💡\", response)" + ] + }, + { + "cell_type": "markdown", + "id": "b7krhqj88ku", + "metadata": {}, + "source": [ + "#### Example 2: Returns Policy Query" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "61995550-bb0b-46a8-a5d0-023207475d60", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores/vs_708c060b-45da-423e-8354-68529b4fd1a6/search \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "❓ Can I return a product after 40 days?\n", + "💡 Based on the provided context, you cannot return a product after 40 days. 
The return window is limited to 30 days from the date of purchase.\n" + ] + } + ], + "source": [ + "query = \"Can I return a product after 40 days?\"\n", + "response = chain.invoke(query)\n", + "print(\"❓\", query)\n", + "print(\"💡\", response)" + ] + }, + { + "cell_type": "markdown", + "id": "h4w24fadvjs", + "metadata": {}, + "source": [ + "---\n", + "We have successfully built a RAG system that combines:\n", + "\n", + "- **LlamaStack** for infrastructure (LLM serving + Vector Store)\n", + "- **LangChain** for orchestration (prompts + chains)\n", + "- **Fireworks** for high-quality language models\n", + "\n", + "### Key Benefits\n", + "\n", + "1. **Unified Infrastructure**: Single server for LLMs and Vector Store\n", + "2. **OpenAI Compatibility**: Easy integration with existing LangChain code\n", + "3. **Multi-Provider Support**: Switch between different LLM providers\n", + "4. **Production Ready**: Built-in safety shields and monitoring\n", + "\n", + "### Next Steps\n", + "\n", + "- Add more sophisticated document processing\n", + "- Implement conversation memory\n", + "- Add safety filtering and monitoring\n", + "- Scale to larger document collections\n", + "- Integrate with web frameworks like FastAPI or Streamlit\n", + "\n", + "---\n", + "\n", + "##### 🔧 Cleanup\n", + "\n", + "Don't forget to stop the LlamaStack server when you're done:\n", + "\n", + "```python\n", + "kill_llama_stack_server()\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "15647c46-22ce-4698-af3f-8161329d8e3a", + "metadata": {}, + "outputs": [], + "source": [ + "kill_llama_stack_server()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 69a52213a190bddcf118bb13206353ff4b30d33d Mon Sep 17 00:00:00 2001 From: Charlie Doern Date: Thu, 11 Sep 2025 16:30:09 -0400 Subject: [PATCH 22/30] fix: oasdiff enhancements and stability (#3419) # What does this PR do? only run conformance tests when the spec is changed. Also, cache oasdiff such that it is not installed every time the test is run Signed-off-by: Charlie Doern --- .github/workflows/conformance.yml | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/.github/workflows/conformance.yml b/.github/workflows/conformance.yml index c0a7795a3..c7962c93d 100644 --- a/.github/workflows/conformance.yml +++ b/.github/workflows/conformance.yml @@ -13,11 +13,8 @@ on: branches: [ main ] types: [opened, synchronize, reopened] paths: - - 'llama_stack/**' - - '!llama_stack/ui/**' - - 'tests/**' - - 'uv.lock' - - 'pyproject.toml' + - 'docs/_static/llama-stack-spec.yaml' + - 'docs/_static/llama-stack-spec.html' - '.github/workflows/conformance.yml' # This workflow itself concurrency: @@ -43,10 +40,27 @@ jobs: ref: ${{ github.event.pull_request.base.ref }} path: 'base' + # Cache oasdiff to avoid checksum failures and speed up builds + - name: Cache oasdiff + id: cache-oasdiff + uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 + with: + path: ~/oasdiff + key: oasdiff-${{ runner.os }} + # Install oasdiff: https://github.com/oasdiff/oasdiff, a tool for detecting breaking changes in OpenAPI specs. 
- name: Install oasdiff + if: steps.cache-oasdiff.outputs.cache-hit != 'true' run: | curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh + cp /usr/local/bin/oasdiff ~/oasdiff + + # Setup cached oasdiff + - name: Setup cached oasdiff + if: steps.cache-oasdiff.outputs.cache-hit == 'true' + run: | + sudo cp ~/oasdiff /usr/local/bin/oasdiff + sudo chmod +x /usr/local/bin/oasdiff # Run oasdiff to detect breaking changes in the API specification # This step will fail if incompatible changes are detected, preventing breaking changes from being merged From d31e641d6902dd1f43a3cc034af31a58ac135425 Mon Sep 17 00:00:00 2001 From: Akram Ben Aissi Date: Fri, 12 Sep 2025 10:10:59 +0100 Subject: [PATCH 23/30] fix: Improve pre-commit workflow error handling and feedback (#3400) # What does this PR do? fix: Improve pre-commit workflow error handling and feedback - Add explicit step to check pre-commit results and provide clear error messages - Improve verification steps with better error messages and file listings - Use GitHub Actions annotations (::error:: and ::warning::) for better visibility - Maintain continue-on-error for pre-commit step but add proper failure handling This addresses the issue where pre-commit failures were silent but still caused workflow failures later, making it difficult to understand what needed to be fixed. ## Test Plan Signed-off-by: Akram Ben Aissi --- .github/workflows/pre-commit.yml | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 000208043..b5845be53 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -47,11 +47,21 @@ jobs: run: npm ci working-directory: llama_stack/ui - - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + - name: Run pre-commit + id: precommit + uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + continue-on-error: true env: SKIP: no-commit-to-branch RUFF_OUTPUT_FORMAT: github + - name: Check pre-commit results + if: steps.precommit.outcome == 'failure' + run: | + echo "::error::Pre-commit hooks failed. Please run 'pre-commit run --all-files' locally and commit the fixes." + echo "::warning::Some pre-commit hooks failed. Check the output above for details." + exit 1 + - name: Debug run: | echo "github.ref: ${{ github.ref }}" @@ -79,17 +89,23 @@ jobs: echo "No changes to commit" fi - - name: Verify if there are any diff files after pre-commit + - name: Verify no uncommitted changes if: github.actor != 'dependabot[bot]' run: | - git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1) + if ! git diff --exit-code; then + echo "::error::There are uncommitted changes after pre-commit. Please run 'pre-commit run --all-files' locally and commit the fixes." + echo "::warning::Files with changes:" + git diff --name-status + exit 1 + fi - name: Verify if there are any new files after pre-commit if: github.actor != 'dependabot[bot]' run: | unstaged_files=$(git ls-files --others --exclude-standard) if [ -n "$unstaged_files" ]; then - echo "There are uncommitted new files, run pre-commit locally and commit again" + echo "::error::There are new untracked files after pre-commit. Please run 'pre-commit run --all-files' locally and commit the fixes." 
+ echo "::warning::New files:" echo "$unstaged_files" exit 1 fi From f67081d2d6088a0d3175baffad94977ddf8f6483 Mon Sep 17 00:00:00 2001 From: Doug Edgar Date: Fri, 12 Sep 2025 02:18:19 -0700 Subject: [PATCH 24/30] feat: migrate to FIPS-validated cryptographic algorithms (#3423) # What does this PR do? Migrates MD5 and SHA-1 hash algorithms to SHA-256. In particular, replaces: - MD5 in chunk ID generation. - MD5 in file verification. - SHA-1 in model identifier digests. And updates all related test expectations. Original discussion: https://github.com/llamastack/llama-stack/discussions/3413 Closes #3424. ## Test Plan Unit tests from scripts/unit-tests.sh were updated to match the new hash output, and ran to verify the tests pass. Signed-off-by: Doug Edgar --- llama_stack/cli/verify_download.py | 17 +++++++---------- .../providers/utils/vector_io/vector_utils.py | 6 ++---- llama_stack/testing/inference_recorder.py | 2 +- .../providers/vector_io/test_vector_utils.py | 12 ++++++------ 4 files changed, 16 insertions(+), 21 deletions(-) diff --git a/llama_stack/cli/verify_download.py b/llama_stack/cli/verify_download.py index b7f4cfdb5..e738abb4f 100644 --- a/llama_stack/cli/verify_download.py +++ b/llama_stack/cli/verify_download.py @@ -48,15 +48,12 @@ def setup_verify_download_parser(parser: argparse.ArgumentParser) -> None: parser.set_defaults(func=partial(run_verify_cmd, parser=parser)) -def calculate_md5(filepath: Path, chunk_size: int = 8192) -> str: - # NOTE: MD5 is used here only for download integrity verification, - # not for security purposes - # TODO: switch to SHA256 - md5_hash = hashlib.md5(usedforsecurity=False) +def calculate_sha256(filepath: Path, chunk_size: int = 8192) -> str: + sha256_hash = hashlib.sha256() with open(filepath, "rb") as f: for chunk in iter(lambda: f.read(chunk_size), b""): - md5_hash.update(chunk) - return md5_hash.hexdigest() + sha256_hash.update(chunk) + return sha256_hash.hexdigest() def load_checksums(checklist_path: Path) -> dict[str, str]: @@ -64,10 +61,10 @@ def load_checksums(checklist_path: Path) -> dict[str, str]: with open(checklist_path) as f: for line in f: if line.strip(): - md5sum, filepath = line.strip().split(" ", 1) + sha256sum, filepath = line.strip().split(" ", 1) # Remove leading './' if present filepath = filepath.lstrip("./") - checksums[filepath] = md5sum + checksums[filepath] = sha256sum return checksums @@ -88,7 +85,7 @@ def verify_files(model_dir: Path, checksums: dict[str, str], console: Console) - matches = False if exists: - actual_hash = calculate_md5(full_path) + actual_hash = calculate_sha256(full_path) matches = actual_hash == expected_hash results.append( diff --git a/llama_stack/providers/utils/vector_io/vector_utils.py b/llama_stack/providers/utils/vector_io/vector_utils.py index e55ac75ae..324f35405 100644 --- a/llama_stack/providers/utils/vector_io/vector_utils.py +++ b/llama_stack/providers/utils/vector_io/vector_utils.py @@ -12,14 +12,12 @@ import uuid def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str: """ Generate a unique chunk ID using a hash of the document ID and chunk text. - - Note: MD5 is used only to calculate an identifier, not for security purposes. - Adding usedforsecurity=False for compatibility with FIPS environments. + Then use the first 32 characters of the hash to create a UUID. 
""" hash_input = f"{document_id}:{chunk_text}".encode() if chunk_window: hash_input += f":{chunk_window}".encode() - return str(uuid.UUID(hashlib.md5(hash_input, usedforsecurity=False).hexdigest())) + return str(uuid.UUID(hashlib.sha256(hash_input).hexdigest()[:32])) def proper_case(s: str) -> str: diff --git a/llama_stack/testing/inference_recorder.py b/llama_stack/testing/inference_recorder.py index e78f493a6..6f017c51d 100644 --- a/llama_stack/testing/inference_recorder.py +++ b/llama_stack/testing/inference_recorder.py @@ -211,7 +211,7 @@ def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str: return sorted(set(idents)) identifiers = _extract_model_identifiers() - return hashlib.sha1(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8] + return hashlib.sha256(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8] def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None: diff --git a/tests/unit/providers/vector_io/test_vector_utils.py b/tests/unit/providers/vector_io/test_vector_utils.py index a5d803a82..10ebe5bfb 100644 --- a/tests/unit/providers/vector_io/test_vector_utils.py +++ b/tests/unit/providers/vector_io/test_vector_utils.py @@ -26,9 +26,9 @@ def test_generate_chunk_id(): chunk_ids = sorted([chunk.chunk_id for chunk in chunks]) assert chunk_ids == [ - "177a1368-f6a8-0c50-6e92-18677f2c3de3", - "bc744db3-1b25-0a9c-cdff-b6ba3df73c36", - "f68df25d-d9aa-ab4d-5684-64a233add20d", + "31d1f9a3-c8d2-66e7-3c37-af2acd329778", + "d07dade7-29c0-cda7-df29-0249a1dcbc3e", + "d14f75a1-5855-7f72-2c78-d9fc4275a346", ] @@ -36,14 +36,14 @@ def test_generate_chunk_id_with_window(): chunk = Chunk(content="test", metadata={"document_id": "doc-1"}) chunk_id1 = generate_chunk_id("doc-1", chunk, chunk_window="0-1") chunk_id2 = generate_chunk_id("doc-1", chunk, chunk_window="1-2") - assert chunk_id1 == "149018fe-d0eb-0f8d-5f7f-726bdd2aeedb" - assert chunk_id2 == "4562c1ee-9971-1f3b-51a6-7d05e5211154" + assert chunk_id1 == "8630321a-d9cb-2bb6-cd28-ebf68dafd866" + assert chunk_id2 == "13a1c09a-cbda-b61a-2d1a-7baa90888685" def test_chunk_id(): # Test with existing chunk ID chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"}) - assert chunk_with_id.chunk_id == "84ededcc-b80b-a83e-1a20-ca6515a11350" + assert chunk_with_id.chunk_id == "11704f92-42b6-61df-bf85-6473e7708fbd" # Test with document ID in metadata chunk_with_doc_id = Chunk(content="test", metadata={"document_id": "doc-1"}) From 3de9ad0a87d7bfad50ab23c859cebcaf06b6911b Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Fri, 12 Sep 2025 17:59:56 -0400 Subject: [PATCH 25/30] chore(recorder, tests): add test for openai /v1/models (#3426) # What does this PR do? 
- [x] adds a test for the recorder's handling of /v1/models - [x] adds a fix for /v1/models handling ## Test Plan ci --- llama_stack/testing/inference_recorder.py | 60 ++++++++++--------- .../distribution/test_inference_recordings.py | 51 ++++++++++++++-- 2 files changed, 79 insertions(+), 32 deletions(-) diff --git a/llama_stack/testing/inference_recorder.py b/llama_stack/testing/inference_recorder.py index 6f017c51d..745160976 100644 --- a/llama_stack/testing/inference_recorder.py +++ b/llama_stack/testing/inference_recorder.py @@ -7,6 +7,7 @@ from __future__ import annotations # for forward references import hashlib +import inspect import json import os from collections.abc import Generator @@ -198,16 +199,11 @@ def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str: Supported endpoints: - '/api/tags' (Ollama): response body has 'models': [ { name/model/digest/id/... }, ... ] - - '/v1/models' (OpenAI): response body has 'data': [ { id: ... }, ... ] + - '/v1/models' (OpenAI): response body is: [ { id: ... }, ... ] Returns a list of unique identifiers or None if structure doesn't match. """ - body = response["body"] - if endpoint == "/api/tags": - items = body.get("models") - idents = [m.model for m in items] - else: - items = body.get("data") - idents = [m.id for m in items] + items = response["body"] + idents = [m.model if endpoint == "/api/tags" else m.id for m in items] return sorted(set(idents)) identifiers = _extract_model_identifiers() @@ -219,28 +215,22 @@ def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) seen: dict[str, dict[str, Any]] = {} for rec in records: body = rec["response"]["body"] - if endpoint == "/api/tags": - items = body.models - elif endpoint == "/v1/models": - items = body.data - else: - items = [] - - for m in items: - if endpoint == "/v1/models": + if endpoint == "/v1/models": + for m in body: key = m.id - else: + seen[key] = m + elif endpoint == "/api/tags": + for m in body.models: key = m.model - seen[key] = m + seen[key] = m ordered = [seen[k] for k in sorted(seen.keys())] canonical = records[0] canonical_req = canonical.get("request", {}) if isinstance(canonical_req, dict): canonical_req["endpoint"] = endpoint - if endpoint == "/v1/models": - body = {"data": ordered, "object": "list"} - else: + body = ordered + if endpoint == "/api/tags": from ollama import ListResponse body = ListResponse(models=ordered) @@ -252,7 +242,10 @@ async def _patched_inference_method(original_method, self, client_type, endpoint if _current_mode == InferenceMode.LIVE or _current_storage is None: # Normal operation - return await original_method(self, *args, **kwargs) + if inspect.iscoroutinefunction(original_method): + return await original_method(self, *args, **kwargs) + else: + return original_method(self, *args, **kwargs) # Get base URL based on client type if client_type == "openai": @@ -300,7 +293,14 @@ async def _patched_inference_method(original_method, self, client_type, endpoint ) elif _current_mode == InferenceMode.RECORD: - response = await original_method(self, *args, **kwargs) + if inspect.iscoroutinefunction(original_method): + response = await original_method(self, *args, **kwargs) + else: + response = original_method(self, *args, **kwargs) + + # we want to store the result of the iterator, not the iterator itself + if endpoint == "/v1/models": + response = [m async for m in response] request_data = { "method": method, @@ -380,10 +380,14 @@ def patch_inference_clients(): _original_methods["embeddings_create"], 
self, "openai", "/v1/embeddings", *args, **kwargs ) - async def patched_models_list(self, *args, **kwargs): - return await _patched_inference_method( - _original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs - ) + def patched_models_list(self, *args, **kwargs): + async def _iter(): + for item in await _patched_inference_method( + _original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs + ): + yield item + + return _iter() # Apply OpenAI patches AsyncChatCompletions.create = patched_chat_completions_create diff --git a/tests/unit/distribution/test_inference_recordings.py b/tests/unit/distribution/test_inference_recordings.py index c69cf319b..94fd2536e 100644 --- a/tests/unit/distribution/test_inference_recordings.py +++ b/tests/unit/distribution/test_inference_recordings.py @@ -6,10 +6,11 @@ import tempfile from pathlib import Path -from unittest.mock import patch +from unittest.mock import AsyncMock, Mock, patch import pytest from openai import AsyncOpenAI +from openai.types.model import Model as OpenAIModel # Import the real Pydantic response types instead of using Mocks from llama_stack.apis.inference import ( @@ -158,7 +159,9 @@ class TestInferenceRecording: return real_openai_chat_response temp_storage_dir = temp_storage_dir / "test_recording_mode" - with patch("openai.resources.chat.completions.AsyncCompletions.create", side_effect=mock_create): + with patch( + "openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock, side_effect=mock_create + ): with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") @@ -184,7 +187,9 @@ class TestInferenceRecording: temp_storage_dir = temp_storage_dir / "test_replay_mode" # First, record a response - with patch("openai.resources.chat.completions.AsyncCompletions.create", side_effect=mock_create): + with patch( + "openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock, side_effect=mock_create + ): with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") @@ -213,6 +218,42 @@ class TestInferenceRecording: # Verify the original method was NOT called mock_create_patch.assert_not_called() + async def test_replay_mode_models(self, temp_storage_dir): + """Test that replay mode returns stored responses without making real model listing calls.""" + + async def _async_iterator(models): + for model in models: + yield model + + models = [ + OpenAIModel(id="foo", created=1, object="model", owned_by="test"), + OpenAIModel(id="bar", created=2, object="model", owned_by="test"), + ] + + expected_ids = {m.id for m in models} + + temp_storage_dir = temp_storage_dir / "test_replay_mode_models" + + # baseline - mock works without recording + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.models._get_api_list = Mock(return_value=_async_iterator(models)) + assert {m.id async for m in client.models.list()} == expected_ids + client.models._get_api_list.assert_called_once() + + # record the call + with inference_recording(mode=InferenceMode.RECORD, storage_dir=temp_storage_dir): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.models._get_api_list = Mock(return_value=_async_iterator(models)) + assert {m.id async for m in client.models.list()} == expected_ids + 
client.models._get_api_list.assert_called_once() + + # replay the call + with inference_recording(mode=InferenceMode.REPLAY, storage_dir=temp_storage_dir): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.models._get_api_list = Mock(return_value=_async_iterator(models)) + assert {m.id async for m in client.models.list()} == expected_ids + client.models._get_api_list.assert_not_called() + async def test_replay_missing_recording(self, temp_storage_dir): """Test that replay mode fails when no recording is found.""" temp_storage_dir = temp_storage_dir / "test_replay_missing_recording" @@ -233,7 +274,9 @@ class TestInferenceRecording: temp_storage_dir = temp_storage_dir / "test_embeddings_recording" # Record - with patch("openai.resources.embeddings.AsyncEmbeddings.create", side_effect=mock_create): + with patch( + "openai.resources.embeddings.AsyncEmbeddings.create", new_callable=AsyncMock, side_effect=mock_create + ): with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") From 8cf2128b40195634c4024e3c797eceaaa4da19bc Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Sat, 13 Sep 2025 12:28:04 -0400 Subject: [PATCH 26/30] chore(tests): always show slowest tests (#3431) # What does this PR do? help developers identify slow tests by always passing --duration to pytest ## Test Plan n/a --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 72c4f6f9e..ce95b758f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -354,6 +354,7 @@ warn_required_dynamic_aliases = true classmethod-decorators = ["classmethod", "pydantic.field_validator"] [tool.pytest.ini_options] +addopts = ["--durations=10"] asyncio_mode = "auto" markers = [ "allow_network: Allow network access for specific unit tests", From 6787755c0c8af6b59322352f985cffb224aadd3b Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Sat, 13 Sep 2025 14:11:38 -0400 Subject: [PATCH 27/30] chore(recorder): add support for NOT_GIVEN (#3430) # What does this PR do? the recorder mocks the openai-python interface. the openai-python interface allows NOT_GIVEN as an input option. this change properly handles NOT_GIVEN. 
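For context, `NOT_GIVEN` is the sentinel object the openai-python client passes for optional parameters the caller never set, so anything that hashes or serializes request kwargs (as the recorder does) has to filter it out first. A minimal sketch of that filtering idea; the `normalize_request` helper here is hypothetical, not the recorder's actual API:

```python
from openai import NOT_GIVEN


def normalize_request(endpoint: str, **kwargs) -> dict:
    """Drop parameters the caller left unset before recording a request."""
    # NOT_GIVEN is a singleton sentinel, so identity comparison is the right
    # test; checking `v is not None` would wrongly keep NOT_GIVEN values.
    given = {k: v for k, v in kwargs.items() if v is not NOT_GIVEN}
    return {"endpoint": endpoint, "kwargs": given}


# An unset `user` disappears from the recorded request:
# normalize_request("/v1/chat/completions", temperature=0.7, user=NOT_GIVEN)
# -> {"endpoint": "/v1/chat/completions", "kwargs": {"temperature": 0.7}}
```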
## Test Plan ci (coverage for chat, completions, embeddings) --- llama_stack/testing/inference_recorder.py | 5 ++ .../distribution/test_inference_recordings.py | 65 ++++++++++++++++++- 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/llama_stack/testing/inference_recorder.py b/llama_stack/testing/inference_recorder.py index 745160976..f899d73d3 100644 --- a/llama_stack/testing/inference_recorder.py +++ b/llama_stack/testing/inference_recorder.py @@ -16,6 +16,8 @@ from enum import StrEnum from pathlib import Path from typing import Any, Literal, cast +from openai import NOT_GIVEN + from llama_stack.log import get_logger logger = get_logger(__name__, category="testing") @@ -250,6 +252,9 @@ async def _patched_inference_method(original_method, self, client_type, endpoint # Get base URL based on client type if client_type == "openai": base_url = str(self._client.base_url) + + # the OpenAI client methods may pass NOT_GIVEN for unset parameters; filter these out + kwargs = {k: v for k, v in kwargs.items() if v is not NOT_GIVEN} elif client_type == "ollama": # Get base URL from the client (Ollama client uses host attribute) base_url = getattr(self, "host", "http://localhost:11434") diff --git a/tests/unit/distribution/test_inference_recordings.py b/tests/unit/distribution/test_inference_recordings.py index 94fd2536e..4909bbe1e 100644 --- a/tests/unit/distribution/test_inference_recordings.py +++ b/tests/unit/distribution/test_inference_recordings.py @@ -9,7 +9,7 @@ from pathlib import Path from unittest.mock import AsyncMock, Mock, patch import pytest -from openai import AsyncOpenAI +from openai import NOT_GIVEN, AsyncOpenAI from openai.types.model import Model as OpenAIModel # Import the real Pydantic response types instead of using Mocks @@ -17,6 +17,7 @@ from llama_stack.apis.inference import ( OpenAIAssistantMessageParam, OpenAIChatCompletion, OpenAIChoice, + OpenAICompletion, OpenAIEmbeddingData, OpenAIEmbeddingsResponse, OpenAIEmbeddingUsage, @@ -170,6 +171,7 @@ class TestInferenceRecording: messages=[{"role": "user", "content": "Hello, how are you?"}], temperature=0.7, max_tokens=50, + user=NOT_GIVEN, ) # Verify the response was returned correctly @@ -198,6 +200,7 @@ class TestInferenceRecording: messages=[{"role": "user", "content": "Hello, how are you?"}], temperature=0.7, max_tokens=50, + user=NOT_GIVEN, ) # Now test replay mode - should not call the original method @@ -281,7 +284,11 @@ class TestInferenceRecording: client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") response = await client.embeddings.create( - model="nomic-embed-text", input=["Hello world", "Test embedding"] + model=real_embeddings_response.model, + input=["Hello world", "Test embedding"], + encoding_format=NOT_GIVEN, + dimensions=NOT_GIVEN, + user=NOT_GIVEN, ) assert len(response.data) == 2 @@ -292,7 +299,8 @@ class TestInferenceRecording: client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") response = await client.embeddings.create( - model="nomic-embed-text", input=["Hello world", "Test embedding"] + model=real_embeddings_response.model, + input=["Hello world", "Test embedding"], ) # Verify we got the recorded response @@ -302,6 +310,57 @@ class TestInferenceRecording: # Verify original method was not called mock_create_patch.assert_not_called() + async def test_completions_recording(self, temp_storage_dir): + real_completions_response = OpenAICompletion( + id="test_completion", + object="text_completion", + created=1234567890, + model="llama3.2:3b", + 
choices=[ + { + "text": "Hello! I'm doing well, thank you for asking.", + "index": 0, + "logprobs": None, + "finish_reason": "stop", + } + ], + ) + + async def mock_create(*args, **kwargs): + return real_completions_response + + temp_storage_dir = temp_storage_dir / "test_completions_recording" + + # Record + with patch( + "openai.resources.completions.AsyncCompletions.create", new_callable=AsyncMock, side_effect=mock_create + ): + with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + + response = await client.completions.create( + model=real_completions_response.model, + prompt="Hello, how are you?", + temperature=0.7, + max_tokens=50, + user=NOT_GIVEN, + ) + + assert response.choices[0].text == real_completions_response.choices[0].text + + # Replay + with patch("openai.resources.completions.AsyncCompletions.create") as mock_create_patch: + with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + response = await client.completions.create( + model=real_completions_response.model, + prompt="Hello, how are you?", + temperature=0.7, + max_tokens=50, + ) + assert response.choices[0].text == real_completions_response.choices[0].text + mock_create_patch.assert_not_called() + async def test_live_mode(self, real_openai_chat_response): """Test that live mode passes through to original methods.""" From 36fd97e306d14cbb5eba7c18ce93dcb05bdf9206 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Sep 2025 09:46:05 +0200 Subject: [PATCH 28/30] chore(ui-deps): bump next from 15.3.3 to 15.5.3 in /llama_stack/ui (#3438) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [next](https://github.com/vercel/next.js) from 15.3.3 to 15.5.3.
Release notes

Sourced from next's releases.

v15.5.3

[!NOTE]
This release is backporting bug fixes. It does not include all pending features/changes on canary.

Core Changes

  • fix: validation return types of pages API routes (#83069)
  • fix: relative paths in dev in validator.ts (#83073)
  • fix: remove satisfies keyword from type validation to preserve old TS compatibility (#83071)

Credits

Huge thanks to @​bgub for helping!

v15.5.2

[!NOTE]
This release is backporting bug fixes. It does not include all pending features/changes on canary.

Core Changes

  • fix: disable unknownatrules lint rule entirely (#83059)
  • revert: add ?dpl to fonts in /_next/static/media (#83062)

Credits

Huge thanks to @bgub and @ztanner for helping!

v15.5.1

[!NOTE]
This release is backporting bug fixes. It does not include all pending features/changes on canary.

Core Changes

  • fix: aliased navigations should apply scroll handling (#82900)
  • Turbopack: fix invalid NFT entry with file behind symlink (#82887)
  • fix: typesafe linking to route handlers and pages API routes (#82858)
  • fix: change "noUnknownAtRules" to "warn" for Biome (#82974)
  • fix: add path normalization to getRelativePath for Windows (#82918)
  • feat: add typesafety with config.typedRoutes to redirect() and permanentRedirect() (#82860)
  • fix: avoid importing types that will be unused (#82856)
  • fix: update the config.api.responseLimit type (#82852)
  • fix: update validation return types (#82854)

Credits

Huge thanks to @bgub, @mischnic, and @ztanner for helping!

v15.5.1-canary.39

Core Changes

  • [metadata] change the metadata routes params to promises: #83560

... (truncated)

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=next&package-manager=npm_and_yarn&previous-version=15.3.3&new-version=15.5.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. ---
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- llama_stack/ui/package-lock.json | 370 +++++++++++++++++-------------- llama_stack/ui/package.json | 2 +- 2 files changed, 199 insertions(+), 173 deletions(-) diff --git a/llama_stack/ui/package-lock.json b/llama_stack/ui/package-lock.json index e2c0815fd..ff73fa2e8 100644 --- a/llama_stack/ui/package-lock.json +++ b/llama_stack/ui/package-lock.json @@ -20,7 +20,7 @@ "framer-motion": "^12.23.12", "llama-stack-client": "^0.2.21", "lucide-react": "^0.542.0", - "next": "15.3.3", + "next": "15.5.3", "next-auth": "^4.24.11", "next-themes": "^0.4.6", "react": "^19.0.0", @@ -664,9 +664,9 @@ } }, "node_modules/@emnapi/runtime": { - "version": "1.4.3", - "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.4.3.tgz", - "integrity": "sha512-pBPWdu6MLKROBX05wSNKcNb++m5Er+KQ9QkB+WVM+pW2Kx9hoSrVTnu3BdkI5eBLZoKu/J6mW/B6i6bJB2ytXQ==", + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.5.0.tgz", + "integrity": "sha512-97/BJ3iXHww3djw6hYIfErCZFee7qCtrneuLa20UXFCOTCfBM2cvQHjWJ2EG0s0MtdNwInarqCTz35i4wWXHsQ==", "license": "MIT", "optional": true, "dependencies": { @@ -927,9 +927,9 @@ } }, "node_modules/@img/sharp-darwin-arm64": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.1.tgz", - "integrity": "sha512-pn44xgBtgpEbZsu+lWf2KNb6OAf70X68k+yk69Ic2Xz11zHR/w24/U49XT7AeRwJ0Px+mhALhU5LPci1Aymk7A==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.3.tgz", + "integrity": "sha512-ryFMfvxxpQRsgZJqBd4wsttYQbCxsJksrv9Lw/v798JcQ8+w84mBWuXwl+TT0WJ/WrYOLaYpwQXi3sA9nTIaIg==", "cpu": [ "arm64" ], @@ -945,13 +945,13 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-darwin-arm64": "1.1.0" + "@img/sharp-libvips-darwin-arm64": "1.2.0" } }, "node_modules/@img/sharp-darwin-x64": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.1.tgz", - "integrity": "sha512-VfuYgG2r8BpYiOUN+BfYeFo69nP/MIwAtSJ7/Zpxc5QF3KS22z8Pvg3FkrSFJBPNQ7mmcUcYQFBmEQp7eu1F8Q==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.3.tgz", + "integrity": "sha512-yHpJYynROAj12TA6qil58hmPmAwxKKC7reUqtGLzsOHfP7/rniNGTL8tjWX6L3CTV4+5P4ypcS7Pp+7OB+8ihA==", "cpu": [ "x64" ], @@ -967,13 +967,13 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-darwin-x64": "1.1.0" + "@img/sharp-libvips-darwin-x64": "1.2.0" } }, "node_modules/@img/sharp-libvips-darwin-arm64": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.1.0.tgz", - "integrity": "sha512-HZ/JUmPwrJSoM4DIQPv/BfNh9yrOA8tlBbqbLz4JZ5uew2+o22Ik+tHQJcih7QJuSa0zo5coHTfD5J8inqj9DA==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.2.0.tgz", + "integrity": "sha512-sBZmpwmxqwlqG9ueWFXtockhsxefaV6O84BMOrhtg/YqbTaRdqDE7hxraVE3y6gVM4eExmfzW4a8el9ArLeEiQ==", "cpu": [ "arm64" ], @@ -987,9 +987,9 @@ } }, "node_modules/@img/sharp-libvips-darwin-x64": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.1.0.tgz", - "integrity": 
"sha512-Xzc2ToEmHN+hfvsl9wja0RlnXEgpKNmftriQp6XzY/RaSfwD9th+MSh0WQKzUreLKKINb3afirxW7A0fz2YWuQ==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.2.0.tgz", + "integrity": "sha512-M64XVuL94OgiNHa5/m2YvEQI5q2cl9d/wk0qFTDVXcYzi43lxuiFTftMR1tOnFQovVXNZJ5TURSDK2pNe9Yzqg==", "cpu": [ "x64" ], @@ -1003,9 +1003,9 @@ } }, "node_modules/@img/sharp-libvips-linux-arm": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.1.0.tgz", - "integrity": "sha512-s8BAd0lwUIvYCJyRdFqvsj+BJIpDBSxs6ivrOPm/R7piTs5UIwY5OjXrP2bqXC9/moGsyRa37eYWYCOGVXxVrA==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.2.0.tgz", + "integrity": "sha512-mWd2uWvDtL/nvIzThLq3fr2nnGfyr/XMXlq8ZJ9WMR6PXijHlC3ksp0IpuhK6bougvQrchUAfzRLnbsen0Cqvw==", "cpu": [ "arm" ], @@ -1019,9 +1019,9 @@ } }, "node_modules/@img/sharp-libvips-linux-arm64": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.1.0.tgz", - "integrity": "sha512-IVfGJa7gjChDET1dK9SekxFFdflarnUB8PwW8aGwEoF3oAsSDuNUTYS+SKDOyOJxQyDC1aPFMuRYLoDInyV9Ew==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.2.0.tgz", + "integrity": "sha512-RXwd0CgG+uPRX5YYrkzKyalt2OJYRiJQ8ED/fi1tq9WQW2jsQIn0tqrlR5l5dr/rjqq6AHAxURhj2DVjyQWSOA==", "cpu": [ "arm64" ], @@ -1035,9 +1035,9 @@ } }, "node_modules/@img/sharp-libvips-linux-ppc64": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-ppc64/-/sharp-libvips-linux-ppc64-1.1.0.tgz", - "integrity": "sha512-tiXxFZFbhnkWE2LA8oQj7KYR+bWBkiV2nilRldT7bqoEZ4HiDOcePr9wVDAZPi/Id5fT1oY9iGnDq20cwUz8lQ==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-ppc64/-/sharp-libvips-linux-ppc64-1.2.0.tgz", + "integrity": "sha512-Xod/7KaDDHkYu2phxxfeEPXfVXFKx70EAFZ0qyUdOjCcxbjqyJOEUpDe6RIyaunGxT34Anf9ue/wuWOqBW2WcQ==", "cpu": [ "ppc64" ], @@ -1051,9 +1051,9 @@ } }, "node_modules/@img/sharp-libvips-linux-s390x": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.1.0.tgz", - "integrity": "sha512-xukSwvhguw7COyzvmjydRb3x/09+21HykyapcZchiCUkTThEQEOMtBj9UhkaBRLuBrgLFzQ2wbxdeCCJW/jgJA==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.2.0.tgz", + "integrity": "sha512-eMKfzDxLGT8mnmPJTNMcjfO33fLiTDsrMlUVcp6b96ETbnJmd4uvZxVJSKPQfS+odwfVaGifhsB07J1LynFehw==", "cpu": [ "s390x" ], @@ -1067,9 +1067,9 @@ } }, "node_modules/@img/sharp-libvips-linux-x64": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.1.0.tgz", - "integrity": "sha512-yRj2+reB8iMg9W5sULM3S74jVS7zqSzHG3Ol/twnAAkAhnGQnpjj6e4ayUz7V+FpKypwgs82xbRdYtchTTUB+Q==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.2.0.tgz", + "integrity": "sha512-ZW3FPWIc7K1sH9E3nxIGB3y3dZkpJlMnkk7z5tu1nSkBoCgw2nSRTFHI5pB/3CQaJM0pdzMF3paf9ckKMSE9Tg==", "cpu": [ "x64" ], @@ -1083,9 +1083,9 @@ } }, "node_modules/@img/sharp-libvips-linuxmusl-arm64": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.1.0.tgz", - "integrity": 
"sha512-jYZdG+whg0MDK+q2COKbYidaqW/WTz0cc1E+tMAusiDygrM4ypmSCjOJPmFTvHHJ8j/6cAGyeDWZOsK06tP33w==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.2.0.tgz", + "integrity": "sha512-UG+LqQJbf5VJ8NWJ5Z3tdIe/HXjuIdo4JeVNADXBFuG7z9zjoegpzzGIyV5zQKi4zaJjnAd2+g2nna8TZvuW9Q==", "cpu": [ "arm64" ], @@ -1099,9 +1099,9 @@ } }, "node_modules/@img/sharp-libvips-linuxmusl-x64": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.1.0.tgz", - "integrity": "sha512-wK7SBdwrAiycjXdkPnGCPLjYb9lD4l6Ze2gSdAGVZrEL05AOUJESWU2lhlC+Ffn5/G+VKuSm6zzbQSzFX/P65A==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.2.0.tgz", + "integrity": "sha512-SRYOLR7CXPgNze8akZwjoGBoN1ThNZoqpOgfnOxmWsklTGVfJiGJoC/Lod7aNMGA1jSsKWM1+HRX43OP6p9+6Q==", "cpu": [ "x64" ], @@ -1115,9 +1115,9 @@ } }, "node_modules/@img/sharp-linux-arm": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.1.tgz", - "integrity": "sha512-anKiszvACti2sGy9CirTlNyk7BjjZPiML1jt2ZkTdcvpLU1YH6CXwRAZCA2UmRXnhiIftXQ7+Oh62Ji25W72jA==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.3.tgz", + "integrity": "sha512-oBK9l+h6KBN0i3dC8rYntLiVfW8D8wH+NPNT3O/WBHeW0OQWCjfWksLUaPidsrDKpJgXp3G3/hkmhptAW0I3+A==", "cpu": [ "arm" ], @@ -1133,13 +1133,13 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-linux-arm": "1.1.0" + "@img/sharp-libvips-linux-arm": "1.2.0" } }, "node_modules/@img/sharp-linux-arm64": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.1.tgz", - "integrity": "sha512-kX2c+vbvaXC6vly1RDf/IWNXxrlxLNpBVWkdpRq5Ka7OOKj6nr66etKy2IENf6FtOgklkg9ZdGpEu9kwdlcwOQ==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.3.tgz", + "integrity": "sha512-QdrKe3EvQrqwkDrtuTIjI0bu6YEJHTgEeqdzI3uWJOH6G1O8Nl1iEeVYRGdj1h5I21CqxSvQp1Yv7xeU3ZewbA==", "cpu": [ "arm64" ], @@ -1155,13 +1155,35 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-linux-arm64": "1.1.0" + "@img/sharp-libvips-linux-arm64": "1.2.0" + } + }, + "node_modules/@img/sharp-linux-ppc64": { + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-ppc64/-/sharp-linux-ppc64-0.34.3.tgz", + "integrity": "sha512-GLtbLQMCNC5nxuImPR2+RgrviwKwVql28FWZIW1zWruy6zLgA5/x2ZXk3mxj58X/tszVF69KK0Is83V8YgWhLA==", + "cpu": [ + "ppc64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-ppc64": "1.2.0" } }, "node_modules/@img/sharp-linux-s390x": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.34.1.tgz", - "integrity": "sha512-7s0KX2tI9mZI2buRipKIw2X1ufdTeaRgwmRabt5bi9chYfhur+/C1OXg3TKg/eag1W+6CCWLVmSauV1owmRPxA==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.34.3.tgz", + "integrity": "sha512-3gahT+A6c4cdc2edhsLHmIOXMb17ltffJlxR0aC2VPZfwKoTGZec6u5GrFgdR7ciJSsHT27BD3TIuGcuRT0KmQ==", "cpu": [ "s390x" ], @@ 
-1177,13 +1199,13 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-linux-s390x": "1.1.0" + "@img/sharp-libvips-linux-s390x": "1.2.0" } }, "node_modules/@img/sharp-linux-x64": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.1.tgz", - "integrity": "sha512-wExv7SH9nmoBW3Wr2gvQopX1k8q2g5V5Iag8Zk6AVENsjwd+3adjwxtp3Dcu2QhOXr8W9NusBU6XcQUohBZ5MA==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.3.tgz", + "integrity": "sha512-8kYso8d806ypnSq3/Ly0QEw90V5ZoHh10yH0HnrzOCr6DKAPI6QVHvwleqMkVQ0m+fc7EH8ah0BB0QPuWY6zJQ==", "cpu": [ "x64" ], @@ -1199,13 +1221,13 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-linux-x64": "1.1.0" + "@img/sharp-libvips-linux-x64": "1.2.0" } }, "node_modules/@img/sharp-linuxmusl-arm64": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.1.tgz", - "integrity": "sha512-DfvyxzHxw4WGdPiTF0SOHnm11Xv4aQexvqhRDAoD00MzHekAj9a/jADXeXYCDFH/DzYruwHbXU7uz+H+nWmSOQ==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.3.tgz", + "integrity": "sha512-vAjbHDlr4izEiXM1OTggpCcPg9tn4YriK5vAjowJsHwdBIdx0fYRsURkxLG2RLm9gyBq66gwtWI8Gx0/ov+JKQ==", "cpu": [ "arm64" ], @@ -1221,13 +1243,13 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-linuxmusl-arm64": "1.1.0" + "@img/sharp-libvips-linuxmusl-arm64": "1.2.0" } }, "node_modules/@img/sharp-linuxmusl-x64": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.1.tgz", - "integrity": "sha512-pax/kTR407vNb9qaSIiWVnQplPcGU8LRIJpDT5o8PdAx5aAA7AS3X9PS8Isw1/WfqgQorPotjrZL3Pqh6C5EBg==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.3.tgz", + "integrity": "sha512-gCWUn9547K5bwvOn9l5XGAEjVTTRji4aPTqLzGXHvIr6bIDZKNTA34seMPgM0WmSf+RYBH411VavCejp3PkOeQ==", "cpu": [ "x64" ], @@ -1243,20 +1265,20 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-libvips-linuxmusl-x64": "1.1.0" + "@img/sharp-libvips-linuxmusl-x64": "1.2.0" } }, "node_modules/@img/sharp-wasm32": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.34.1.tgz", - "integrity": "sha512-YDybQnYrLQfEpzGOQe7OKcyLUCML4YOXl428gOOzBgN6Gw0rv8dpsJ7PqTHxBnXnwXr8S1mYFSLSa727tpz0xg==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.34.3.tgz", + "integrity": "sha512-+CyRcpagHMGteySaWos8IbnXcHgfDn7pO2fiC2slJxvNq9gDipYBN42/RagzctVRKgxATmfqOSulgZv5e1RdMg==", "cpu": [ "wasm32" ], "license": "Apache-2.0 AND LGPL-3.0-or-later AND MIT", "optional": true, "dependencies": { - "@emnapi/runtime": "^1.4.0" + "@emnapi/runtime": "^1.4.4" }, "engines": { "node": "^18.17.0 || ^20.3.0 || >=21.0.0" @@ -1265,10 +1287,29 @@ "url": "https://opencollective.com/libvips" } }, + "node_modules/@img/sharp-win32-arm64": { + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-arm64/-/sharp-win32-arm64-0.34.3.tgz", + "integrity": "sha512-MjnHPnbqMXNC2UgeLJtX4XqoVHHlZNd+nPt1kRPmj63wURegwBhZlApELdtxM2OIZDRv/DFtLcNhVbd1z8GYXQ==", + "cpu": [ + "arm64" + ], + "license": "Apache-2.0 AND LGPL-3.0-or-later", + "optional": true, + "os": [ + 
"win32" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + } + }, "node_modules/@img/sharp-win32-ia32": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.34.1.tgz", - "integrity": "sha512-WKf/NAZITnonBf3U1LfdjoMgNO5JYRSlhovhRhMxXVdvWYveM4kM3L8m35onYIdh75cOMCo1BexgVQcCDzyoWw==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.34.3.tgz", + "integrity": "sha512-xuCdhH44WxuXgOM714hn4amodJMZl3OEvf0GVTm0BEyMeA2to+8HEdRPShH0SLYptJY1uBw+SCFP9WVQi1Q/cw==", "cpu": [ "ia32" ], @@ -1285,9 +1326,9 @@ } }, "node_modules/@img/sharp-win32-x64": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.1.tgz", - "integrity": "sha512-hw1iIAHpNE8q3uMIRCgGOeDoz9KtFNarFLQclLxr/LK1VBkj8nby18RjFvr6aP7USRYAjTZW6yisnBWMX571Tw==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.3.tgz", + "integrity": "sha512-OWwz05d++TxzLEv4VnsTz5CmZ6mI6S05sfQGEMrNrQcOEERbX46332IvE7pO/EUiw7jUrrS40z/M7kPyjfl04g==", "cpu": [ "x64" ], @@ -1849,9 +1890,10 @@ } }, "node_modules/@next/env": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/env/-/env-15.3.3.tgz", - "integrity": "sha512-OdiMrzCl2Xi0VTjiQQUK0Xh7bJHnOuET2s+3V+Y40WJBAXrJeGA3f+I8MZJ/YQ3mVGi5XGR1L66oFlgqXhQ4Vw==" + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/env/-/env-15.5.3.tgz", + "integrity": "sha512-RSEDTRqyihYXygx/OJXwvVupfr9m04+0vH8vyy0HfZ7keRto6VX9BbEk0J2PUk0VGy6YhklJUSrgForov5F9pw==", + "license": "MIT" }, "node_modules/@next/eslint-plugin-next": { "version": "15.5.2", @@ -1864,12 +1906,13 @@ } }, "node_modules/@next/swc-darwin-arm64": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.3.3.tgz", - "integrity": "sha512-WRJERLuH+O3oYB4yZNVahSVFmtxRNjNF1I1c34tYMoJb0Pve+7/RaLAJJizyYiFhjYNGHRAE1Ri2Fd23zgDqhg==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.5.3.tgz", + "integrity": "sha512-nzbHQo69+au9wJkGKTU9lP7PXv0d1J5ljFpvb+LnEomLtSbJkbZyEs6sbF3plQmiOB2l9OBtN2tNSvCH1nQ9Jg==", "cpu": [ "arm64" ], + "license": "MIT", "optional": true, "os": [ "darwin" @@ -1879,12 +1922,13 @@ } }, "node_modules/@next/swc-darwin-x64": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.3.3.tgz", - "integrity": "sha512-XHdzH/yBc55lu78k/XwtuFR/ZXUTcflpRXcsu0nKmF45U96jt1tsOZhVrn5YH+paw66zOANpOnFQ9i6/j+UYvw==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.5.3.tgz", + "integrity": "sha512-w83w4SkOOhekJOcA5HBvHyGzgV1W/XvOfpkrxIse4uPWhYTTRwtGEM4v/jiXwNSJvfRvah0H8/uTLBKRXlef8g==", "cpu": [ "x64" ], + "license": "MIT", "optional": true, "os": [ "darwin" @@ -1894,12 +1938,13 @@ } }, "node_modules/@next/swc-linux-arm64-gnu": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.3.3.tgz", - "integrity": "sha512-VZ3sYL2LXB8znNGcjhocikEkag/8xiLgnvQts41tq6i+wql63SMS1Q6N8RVXHw5pEUjiof+II3HkDd7GFcgkzw==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.5.3.tgz", + "integrity": "sha512-+m7pfIs0/yvgVu26ieaKrifV8C8yiLe7jVp9SpcIzg7XmyyNE7toC1fy5IOQozmr6kWl/JONC51osih2RyoXRw==", "cpu": [ 
"arm64" ], + "license": "MIT", "optional": true, "os": [ "linux" @@ -1909,12 +1954,13 @@ } }, "node_modules/@next/swc-linux-arm64-musl": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.3.3.tgz", - "integrity": "sha512-h6Y1fLU4RWAp1HPNJWDYBQ+e3G7sLckyBXhmH9ajn8l/RSMnhbuPBV/fXmy3muMcVwoJdHL+UtzRzs0nXOf9SA==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.5.3.tgz", + "integrity": "sha512-u3PEIzuguSenoZviZJahNLgCexGFhso5mxWCrrIMdvpZn6lkME5vc/ADZG8UUk5K1uWRy4hqSFECrON6UKQBbQ==", "cpu": [ "arm64" ], + "license": "MIT", "optional": true, "os": [ "linux" @@ -1924,12 +1970,13 @@ } }, "node_modules/@next/swc-linux-x64-gnu": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.3.3.tgz", - "integrity": "sha512-jJ8HRiF3N8Zw6hGlytCj5BiHyG/K+fnTKVDEKvUCyiQ/0r5tgwO7OgaRiOjjRoIx2vwLR+Rz8hQoPrnmFbJdfw==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.5.3.tgz", + "integrity": "sha512-lDtOOScYDZxI2BENN9m0pfVPJDSuUkAD1YXSvlJF0DKwZt0WlA7T7o3wrcEr4Q+iHYGzEaVuZcsIbCps4K27sA==", "cpu": [ "x64" ], + "license": "MIT", "optional": true, "os": [ "linux" @@ -1939,12 +1986,13 @@ } }, "node_modules/@next/swc-linux-x64-musl": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.3.3.tgz", - "integrity": "sha512-HrUcTr4N+RgiiGn3jjeT6Oo208UT/7BuTr7K0mdKRBtTbT4v9zJqCDKO97DUqqoBK1qyzP1RwvrWTvU6EPh/Cw==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.5.3.tgz", + "integrity": "sha512-9vWVUnsx9PrY2NwdVRJ4dUURAQ8Su0sLRPqcCCxtX5zIQUBES12eRVHq6b70bbfaVaxIDGJN2afHui0eDm+cLg==", "cpu": [ "x64" ], + "license": "MIT", "optional": true, "os": [ "linux" @@ -1954,12 +2002,13 @@ } }, "node_modules/@next/swc-win32-arm64-msvc": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.3.3.tgz", - "integrity": "sha512-SxorONgi6K7ZUysMtRF3mIeHC5aA3IQLmKFQzU0OuhuUYwpOBc1ypaLJLP5Bf3M9k53KUUUj4vTPwzGvl/NwlQ==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.5.3.tgz", + "integrity": "sha512-1CU20FZzY9LFQigRi6jM45oJMU3KziA5/sSG+dXeVaTm661snQP6xu3ykGxxwU5sLG3sh14teO/IOEPVsQMRfA==", "cpu": [ "arm64" ], + "license": "MIT", "optional": true, "os": [ "win32" @@ -1969,12 +2018,13 @@ } }, "node_modules/@next/swc-win32-x64-msvc": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.3.3.tgz", - "integrity": "sha512-4QZG6F8enl9/S2+yIiOiju0iCTFd93d8VC1q9LZS4p/Xuk81W2QDjCFeoogmrWWkAD59z8ZxepBQap2dKS5ruw==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.5.3.tgz", + "integrity": "sha512-JMoLAq3n3y5tKXPQwCK5c+6tmwkuFDa2XAxz8Wm4+IVthdBZdZGh+lmiLUHg9f9IDwIQpUjp+ysd6OkYTyZRZw==", "cpu": [ "x64" ], + "license": "MIT", "optional": true, "os": [ "win32" @@ -3547,12 +3597,6 @@ "@sinonjs/commons": "^3.0.0" } }, - "node_modules/@swc/counter": { - "version": "0.1.3", - "resolved": "https://registry.npmjs.org/@swc/counter/-/counter-0.1.3.tgz", - "integrity": "sha512-e2BR4lsJkkRlKZ/qCHPw9ZaSxc0MVUd7gtbtaB7aMvHeJVYe8sOB8DBZkP2DtISHGSku9sCK6T6cnY0CtXrOCQ==", - "license": "Apache-2.0" - }, "node_modules/@swc/helpers": { 
"version": "0.5.15", "resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.15.tgz", @@ -5475,17 +5519,6 @@ "dev": true, "license": "MIT" }, - "node_modules/busboy": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz", - "integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==", - "dependencies": { - "streamsearch": "^1.1.0" - }, - "engines": { - "node": ">=10.16.0" - } - }, "node_modules/bytes": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", @@ -8295,9 +8328,9 @@ } }, "node_modules/is-arrayish": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz", - "integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==", + "version": "0.3.4", + "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.4.tgz", + "integrity": "sha512-m6UrgzFVUYawGBh1dUsWR5M2Clqic9RVXC/9f8ceNlv2IcO9j9J/z8UoCLPqtsPBFNzEpfR3xftohbfqDx8EQA==", "license": "MIT", "optional": true }, @@ -11542,14 +11575,13 @@ } }, "node_modules/next": { - "version": "15.3.3", - "resolved": "https://registry.npmjs.org/next/-/next-15.3.3.tgz", - "integrity": "sha512-JqNj29hHNmCLtNvd090SyRbXJiivQ+58XjCcrC50Crb5g5u2zi7Y2YivbsEfzk6AtVI80akdOQbaMZwWB1Hthw==", + "version": "15.5.3", + "resolved": "https://registry.npmjs.org/next/-/next-15.5.3.tgz", + "integrity": "sha512-r/liNAx16SQj4D+XH/oI1dlpv9tdKJ6cONYPwwcCC46f2NjpaRWY+EKCzULfgQYV6YKXjHBchff2IZBSlZmJNw==", + "license": "MIT", "dependencies": { - "@next/env": "15.3.3", - "@swc/counter": "0.1.3", + "@next/env": "15.5.3", "@swc/helpers": "0.5.15", - "busboy": "1.6.0", "caniuse-lite": "^1.0.30001579", "postcss": "8.4.31", "styled-jsx": "5.1.6" @@ -11561,19 +11593,19 @@ "node": "^18.18.0 || ^19.8.0 || >= 20.0.0" }, "optionalDependencies": { - "@next/swc-darwin-arm64": "15.3.3", - "@next/swc-darwin-x64": "15.3.3", - "@next/swc-linux-arm64-gnu": "15.3.3", - "@next/swc-linux-arm64-musl": "15.3.3", - "@next/swc-linux-x64-gnu": "15.3.3", - "@next/swc-linux-x64-musl": "15.3.3", - "@next/swc-win32-arm64-msvc": "15.3.3", - "@next/swc-win32-x64-msvc": "15.3.3", - "sharp": "^0.34.1" + "@next/swc-darwin-arm64": "15.5.3", + "@next/swc-darwin-x64": "15.5.3", + "@next/swc-linux-arm64-gnu": "15.5.3", + "@next/swc-linux-arm64-musl": "15.5.3", + "@next/swc-linux-x64-gnu": "15.5.3", + "@next/swc-linux-x64-musl": "15.5.3", + "@next/swc-win32-arm64-msvc": "15.5.3", + "@next/swc-win32-x64-msvc": "15.5.3", + "sharp": "^0.34.3" }, "peerDependencies": { "@opentelemetry/api": "^1.1.0", - "@playwright/test": "^1.41.2", + "@playwright/test": "^1.51.1", "babel-plugin-react-compiler": "*", "react": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", "react-dom": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", @@ -13240,16 +13272,16 @@ "license": "ISC" }, "node_modules/sharp": { - "version": "0.34.1", - "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.1.tgz", - "integrity": "sha512-1j0w61+eVxu7DawFJtnfYcvSv6qPFvfTaqzTQ2BLknVhHTwGS8sc63ZBF4rzkWMBVKybo4S5OBtDdZahh2A1xg==", + "version": "0.34.3", + "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.3.tgz", + "integrity": "sha512-eX2IQ6nFohW4DbvHIOLRB3MHFpYqaqvXd3Tp5e/T/dSH83fxaNJQRvDMhASmkNTsNTVF2/OOopzRCt7xokgPfg==", "hasInstallScript": true, "license": "Apache-2.0", "optional": true, "dependencies": { "color": "^4.2.3", - "detect-libc": "^2.0.3", - "semver": "^7.7.1" + "detect-libc": "^2.0.4", + 
"semver": "^7.7.2" }, "engines": { "node": "^18.17.0 || ^20.3.0 || >=21.0.0" @@ -13258,26 +13290,28 @@ "url": "https://opencollective.com/libvips" }, "optionalDependencies": { - "@img/sharp-darwin-arm64": "0.34.1", - "@img/sharp-darwin-x64": "0.34.1", - "@img/sharp-libvips-darwin-arm64": "1.1.0", - "@img/sharp-libvips-darwin-x64": "1.1.0", - "@img/sharp-libvips-linux-arm": "1.1.0", - "@img/sharp-libvips-linux-arm64": "1.1.0", - "@img/sharp-libvips-linux-ppc64": "1.1.0", - "@img/sharp-libvips-linux-s390x": "1.1.0", - "@img/sharp-libvips-linux-x64": "1.1.0", - "@img/sharp-libvips-linuxmusl-arm64": "1.1.0", - "@img/sharp-libvips-linuxmusl-x64": "1.1.0", - "@img/sharp-linux-arm": "0.34.1", - "@img/sharp-linux-arm64": "0.34.1", - "@img/sharp-linux-s390x": "0.34.1", - "@img/sharp-linux-x64": "0.34.1", - "@img/sharp-linuxmusl-arm64": "0.34.1", - "@img/sharp-linuxmusl-x64": "0.34.1", - "@img/sharp-wasm32": "0.34.1", - "@img/sharp-win32-ia32": "0.34.1", - "@img/sharp-win32-x64": "0.34.1" + "@img/sharp-darwin-arm64": "0.34.3", + "@img/sharp-darwin-x64": "0.34.3", + "@img/sharp-libvips-darwin-arm64": "1.2.0", + "@img/sharp-libvips-darwin-x64": "1.2.0", + "@img/sharp-libvips-linux-arm": "1.2.0", + "@img/sharp-libvips-linux-arm64": "1.2.0", + "@img/sharp-libvips-linux-ppc64": "1.2.0", + "@img/sharp-libvips-linux-s390x": "1.2.0", + "@img/sharp-libvips-linux-x64": "1.2.0", + "@img/sharp-libvips-linuxmusl-arm64": "1.2.0", + "@img/sharp-libvips-linuxmusl-x64": "1.2.0", + "@img/sharp-linux-arm": "0.34.3", + "@img/sharp-linux-arm64": "0.34.3", + "@img/sharp-linux-ppc64": "0.34.3", + "@img/sharp-linux-s390x": "0.34.3", + "@img/sharp-linux-x64": "0.34.3", + "@img/sharp-linuxmusl-arm64": "0.34.3", + "@img/sharp-linuxmusl-x64": "0.34.3", + "@img/sharp-wasm32": "0.34.3", + "@img/sharp-win32-arm64": "0.34.3", + "@img/sharp-win32-ia32": "0.34.3", + "@img/sharp-win32-x64": "0.34.3" } }, "node_modules/shebang-command": { @@ -13403,9 +13437,9 @@ "license": "ISC" }, "node_modules/simple-swizzle": { - "version": "0.2.2", - "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz", - "integrity": "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==", + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.4.tgz", + "integrity": "sha512-nAu1WFPQSMNr2Zn9PGSZK9AGn4t/y97lEm+MXTtUDwfP0ksAIX4nO+6ruD9Jwut4C49SB1Ws+fbXsm/yScWOHw==", "license": "MIT", "optional": true, "dependencies": { @@ -13526,14 +13560,6 @@ "node": ">= 0.8" } }, - "node_modules/streamsearch": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz", - "integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==", - "engines": { - "node": ">=10.0.0" - } - }, "node_modules/string-length": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/string-length/-/string-length-4.0.2.tgz", diff --git a/llama_stack/ui/package.json b/llama_stack/ui/package.json index e50401fa6..a0a8b2c7c 100644 --- a/llama_stack/ui/package.json +++ b/llama_stack/ui/package.json @@ -25,7 +25,7 @@ "framer-motion": "^12.23.12", "llama-stack-client": "^0.2.21", "lucide-react": "^0.542.0", - "next": "15.3.3", + "next": "15.5.3", "next-auth": "^4.24.11", "next-themes": "^0.4.6", "react": "^19.0.0", From b6cb8178976b941a1fdb3894b00bd13eaca91561 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Sep 2025 09:46:14 
+0200 Subject: [PATCH 29/30] chore(ui-deps): bump @radix-ui/react-select from 2.2.5 to 2.2.6 in /llama_stack/ui (#3437) Bumps [@radix-ui/react-select](https://github.com/radix-ui/primitives) from 2.2.5 to 2.2.6.
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=@radix-ui/react-select&package-manager=npm_and_yarn&previous-version=2.2.5&new-version=2.2.6)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. ---
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- llama_stack/ui/package-lock.json | 77 ++++++++++++++------------------ llama_stack/ui/package.json | 2 +- 2 files changed, 34 insertions(+), 45 deletions(-) diff --git a/llama_stack/ui/package-lock.json b/llama_stack/ui/package-lock.json index ff73fa2e8..f333aa809 100644 --- a/llama_stack/ui/package-lock.json +++ b/llama_stack/ui/package-lock.json @@ -11,7 +11,7 @@ "@radix-ui/react-collapsible": "^1.1.12", "@radix-ui/react-dialog": "^1.1.13", "@radix-ui/react-dropdown-menu": "^2.1.16", - "@radix-ui/react-select": "^2.2.5", + "@radix-ui/react-select": "^2.2.6", "@radix-ui/react-separator": "^1.1.7", "@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-tooltip": "^1.2.8", @@ -2924,22 +2924,22 @@ } }, "node_modules/@radix-ui/react-select": { - "version": "2.2.5", - "resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.5.tgz", - "integrity": "sha512-HnMTdXEVuuyzx63ME0ut4+sEMYW6oouHWNGUZc7ddvUWIcfCva/AMoqEW/3wnEllriMWBa0RHspCYnfCWJQYmA==", + "version": "2.2.6", + "resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.6.tgz", + "integrity": "sha512-I30RydO+bnn2PQztvo25tswPH+wFBjehVGtmagkU78yMdwTwVf12wnAOF+AeP8S2N8xD+5UPbGhkUfPyvT+mwQ==", "license": "MIT", "dependencies": { "@radix-ui/number": "1.1.1", - "@radix-ui/primitive": "1.1.2", + "@radix-ui/primitive": "1.1.3", "@radix-ui/react-collection": "1.1.7", "@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-context": "1.1.2", "@radix-ui/react-direction": "1.1.1", - "@radix-ui/react-dismissable-layer": "1.1.10", - "@radix-ui/react-focus-guards": "1.1.2", + "@radix-ui/react-dismissable-layer": "1.1.11", + "@radix-ui/react-focus-guards": "1.1.3", "@radix-ui/react-focus-scope": "1.1.7", "@radix-ui/react-id": "1.1.1", - "@radix-ui/react-popper": "1.2.7", + "@radix-ui/react-popper": "1.2.8", "@radix-ui/react-portal": "1.1.9", "@radix-ui/react-primitive": "2.1.3", "@radix-ui/react-slot": "1.2.3", @@ -2966,13 +2966,19 @@ } } }, + "node_modules/@radix-ui/react-select/node_modules/@radix-ui/primitive": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz", + "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==", + "license": "MIT" + }, "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-dismissable-layer": { - "version": "1.1.10", - "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.10.tgz", - "integrity": "sha512-IM1zzRV4W3HtVgftdQiiOmA0AdJlCtMLe00FXaHwgt3rAnNsIyDqshvkIW3hj/iu5hu8ERP7KIYki6NkqDxAwQ==", + "version": "1.1.11", + "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz", + "integrity": "sha512-Nqcp+t5cTB8BinFkZgXiMJniQH0PsUt2k51FUhbdfeKvc4ACcG2uQniY/8+h1Yv6Kza4Q7lD7PQV0z0oicE0Mg==", "license": "MIT", "dependencies": { - "@radix-ui/primitive": "1.1.2", + "@radix-ui/primitive": "1.1.3", "@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-primitive": "2.1.3", "@radix-ui/react-use-callback-ref": "1.1.1", @@ -2993,6 +2999,21 @@ } } }, + "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-focus-guards": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.3.tgz", + "integrity": 
"sha512-0rFg/Rj2Q62NCm62jZw0QX7a3sz6QCQU0LpZdNrJX8byRGaGVTqbrW9jAoIAHyMQqsNpeZ81YgSizOt5WXq0Pw==", + "license": "MIT", + "peerDependencies": { + "@types/react": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-focus-scope": { "version": "1.1.7", "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-scope/-/react-focus-scope-1.1.7.tgz", @@ -3018,38 +3039,6 @@ } } }, - "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-popper": { - "version": "1.2.7", - "resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.7.tgz", - "integrity": "sha512-IUFAccz1JyKcf/RjB552PlWwxjeCJB8/4KxT7EhBHOJM+mN7LdW+B3kacJXILm32xawcMMjb2i0cIZpo+f9kiQ==", - "license": "MIT", - "dependencies": { - "@floating-ui/react-dom": "^2.0.0", - "@radix-ui/react-arrow": "1.1.7", - "@radix-ui/react-compose-refs": "1.1.2", - "@radix-ui/react-context": "1.1.2", - "@radix-ui/react-primitive": "2.1.3", - "@radix-ui/react-use-callback-ref": "1.1.1", - "@radix-ui/react-use-layout-effect": "1.1.1", - "@radix-ui/react-use-rect": "1.1.1", - "@radix-ui/react-use-size": "1.1.1", - "@radix-ui/rect": "1.1.1" - }, - "peerDependencies": { - "@types/react": "*", - "@types/react-dom": "*", - "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", - "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" - }, - "peerDependenciesMeta": { - "@types/react": { - "optional": true - }, - "@types/react-dom": { - "optional": true - } - } - }, "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-portal": { "version": "1.1.9", "resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz", diff --git a/llama_stack/ui/package.json b/llama_stack/ui/package.json index a0a8b2c7c..ccbc2a4c2 100644 --- a/llama_stack/ui/package.json +++ b/llama_stack/ui/package.json @@ -16,7 +16,7 @@ "@radix-ui/react-collapsible": "^1.1.12", "@radix-ui/react-dialog": "^1.1.13", "@radix-ui/react-dropdown-menu": "^2.1.16", - "@radix-ui/react-select": "^2.2.5", + "@radix-ui/react-select": "^2.2.6", "@radix-ui/react-separator": "^1.1.7", "@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-tooltip": "^1.2.8", From 01bdcce4d2218754acfe960de58598bc50e32d21 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Mon, 15 Sep 2025 15:25:53 -0400 Subject: [PATCH 30/30] chore(recorder): update mocks to be closer to non-mock environment (#3442) # What does this PR do? the @required_args decorator in openai-python is masking the async nature of the {AsyncCompletions,chat.AsyncCompletions}.create method. see https://github.com/openai/openai-python/issues/996 this means two things - 0. we cannot use iscoroutine in the recorder to detect async vs non 1. our mocks are inappropriately introducing identifiable async for (0), we update the iscoroutine check w/ detection of /v1/models, which is the only non-async function we mock & record. for (1), we could leave everything as is and assume (0) will catch errors. to be defensive, we update the unit tests to mock below create methods, allowing the true openai-python create() methods to be tested. 
--- llama_stack/testing/inference_recorder.py | 14 +- .../distribution/test_inference_recordings.py | 208 +++++++++--------- 2 files changed, 113 insertions(+), 109 deletions(-) diff --git a/llama_stack/testing/inference_recorder.py b/llama_stack/testing/inference_recorder.py index f899d73d3..674016fb1 100644 --- a/llama_stack/testing/inference_recorder.py +++ b/llama_stack/testing/inference_recorder.py @@ -7,7 +7,6 @@ from __future__ import annotations # for forward references import hashlib -import inspect import json import os from collections.abc import Generator @@ -243,11 +242,10 @@ async def _patched_inference_method(original_method, self, client_type, endpoint global _current_mode, _current_storage if _current_mode == InferenceMode.LIVE or _current_storage is None: - # Normal operation - if inspect.iscoroutinefunction(original_method): - return await original_method(self, *args, **kwargs) - else: + if endpoint == "/v1/models": return original_method(self, *args, **kwargs) + else: + return await original_method(self, *args, **kwargs) # Get base URL based on client type if client_type == "openai": @@ -298,10 +296,10 @@ async def _patched_inference_method(original_method, self, client_type, endpoint ) elif _current_mode == InferenceMode.RECORD: - if inspect.iscoroutinefunction(original_method): - response = await original_method(self, *args, **kwargs) - else: + if endpoint == "/v1/models": response = original_method(self, *args, **kwargs) + else: + response = await original_method(self, *args, **kwargs) # we want to store the result of the iterator, not the iterator itself if endpoint == "/v1/models": diff --git a/tests/unit/distribution/test_inference_recordings.py b/tests/unit/distribution/test_inference_recordings.py index 4909bbe1e..5740357c1 100644 --- a/tests/unit/distribution/test_inference_recordings.py +++ b/tests/unit/distribution/test_inference_recordings.py @@ -155,27 +155,22 @@ class TestInferenceRecording: async def test_recording_mode(self, temp_storage_dir, real_openai_chat_response): """Test that recording mode captures and stores responses.""" - - async def mock_create(*args, **kwargs): - return real_openai_chat_response - temp_storage_dir = temp_storage_dir / "test_recording_mode" - with patch( - "openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock, side_effect=mock_create - ): - with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): - client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.chat.completions._post = AsyncMock(return_value=real_openai_chat_response) - response = await client.chat.completions.create( - model="llama3.2:3b", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=0.7, - max_tokens=50, - user=NOT_GIVEN, - ) + response = await client.chat.completions.create( + model="llama3.2:3b", + messages=[{"role": "user", "content": "Hello, how are you?"}], + temperature=0.7, + max_tokens=50, + user=NOT_GIVEN, + ) - # Verify the response was returned correctly - assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking." + # Verify the response was returned correctly + assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking." 
+ client.chat.completions._post.assert_called_once() # Verify recording was stored storage = ResponseStorage(temp_storage_dir) @@ -183,43 +178,38 @@ class TestInferenceRecording: async def test_replay_mode(self, temp_storage_dir, real_openai_chat_response): """Test that replay mode returns stored responses without making real calls.""" - - async def mock_create(*args, **kwargs): - return real_openai_chat_response - temp_storage_dir = temp_storage_dir / "test_replay_mode" # First, record a response - with patch( - "openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock, side_effect=mock_create - ): - with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): - client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.chat.completions._post = AsyncMock(return_value=real_openai_chat_response) - response = await client.chat.completions.create( - model="llama3.2:3b", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=0.7, - max_tokens=50, - user=NOT_GIVEN, - ) + response = await client.chat.completions.create( + model="llama3.2:3b", + messages=[{"role": "user", "content": "Hello, how are you?"}], + temperature=0.7, + max_tokens=50, + user=NOT_GIVEN, + ) + client.chat.completions._post.assert_called_once() # Now test replay mode - should not call the original method - with patch("openai.resources.chat.completions.AsyncCompletions.create") as mock_create_patch: - with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): - client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.chat.completions._post = AsyncMock(return_value=real_openai_chat_response) - response = await client.chat.completions.create( - model="llama3.2:3b", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=0.7, - max_tokens=50, - ) + response = await client.chat.completions.create( + model="llama3.2:3b", + messages=[{"role": "user", "content": "Hello, how are you?"}], + temperature=0.7, + max_tokens=50, + ) - # Verify we got the recorded response - assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking." + # Verify we got the recorded response + assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking." 
- # Verify the original method was NOT called - mock_create_patch.assert_not_called() + # Verify the original method was NOT called + client.chat.completions._post.assert_not_called() async def test_replay_mode_models(self, temp_storage_dir): """Test that replay mode returns stored responses without making real model listing calls.""" @@ -272,43 +262,50 @@ class TestInferenceRecording: async def test_embeddings_recording(self, temp_storage_dir, real_embeddings_response): """Test recording and replay of embeddings calls.""" - async def mock_create(*args, **kwargs): - return real_embeddings_response + # baseline - mock works without recording + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.embeddings._post = AsyncMock(return_value=real_embeddings_response) + response = await client.embeddings.create( + model=real_embeddings_response.model, + input=["Hello world", "Test embedding"], + encoding_format=NOT_GIVEN, + ) + assert len(response.data) == 2 + assert response.data[0].embedding == [0.1, 0.2, 0.3] + client.embeddings._post.assert_called_once() temp_storage_dir = temp_storage_dir / "test_embeddings_recording" # Record - with patch( - "openai.resources.embeddings.AsyncEmbeddings.create", new_callable=AsyncMock, side_effect=mock_create - ): - with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): - client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.embeddings._post = AsyncMock(return_value=real_embeddings_response) - response = await client.embeddings.create( - model=real_embeddings_response.model, - input=["Hello world", "Test embedding"], - encoding_format=NOT_GIVEN, - dimensions=NOT_GIVEN, - user=NOT_GIVEN, - ) + response = await client.embeddings.create( + model=real_embeddings_response.model, + input=["Hello world", "Test embedding"], + encoding_format=NOT_GIVEN, + dimensions=NOT_GIVEN, + user=NOT_GIVEN, + ) - assert len(response.data) == 2 + assert len(response.data) == 2 # Replay - with patch("openai.resources.embeddings.AsyncEmbeddings.create") as mock_create_patch: - with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): - client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.embeddings._post = AsyncMock(return_value=real_embeddings_response) - response = await client.embeddings.create( - model=real_embeddings_response.model, - input=["Hello world", "Test embedding"], - ) + response = await client.embeddings.create( + model=real_embeddings_response.model, + input=["Hello world", "Test embedding"], + ) - # Verify we got the recorded response - assert len(response.data) == 2 - assert response.data[0].embedding == [0.1, 0.2, 0.3] + # Verify we got the recorded response + assert len(response.data) == 2 + assert response.data[0].embedding == [0.1, 0.2, 0.3] - # Verify original method was not called - mock_create_patch.assert_not_called() + # Verify original method was not called + client.embeddings._post.assert_not_called() async def test_completions_recording(self, temp_storage_dir): real_completions_response = OpenAICompletion( @@ -326,40 +323,49 @@ class TestInferenceRecording: ], ) - async 
def mock_create(*args, **kwargs): - return real_completions_response - temp_storage_dir = temp_storage_dir / "test_completions_recording" + # baseline - mock works without recording + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.completions._post = AsyncMock(return_value=real_completions_response) + response = await client.completions.create( + model=real_completions_response.model, + prompt="Hello, how are you?", + temperature=0.7, + max_tokens=50, + user=NOT_GIVEN, + ) + assert response.choices[0].text == real_completions_response.choices[0].text + client.completions._post.assert_called_once() + # Record - with patch( - "openai.resources.completions.AsyncCompletions.create", new_callable=AsyncMock, side_effect=mock_create - ): - with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): - client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.completions._post = AsyncMock(return_value=real_completions_response) - response = await client.completions.create( - model=real_completions_response.model, - prompt="Hello, how are you?", - temperature=0.7, - max_tokens=50, - user=NOT_GIVEN, - ) + response = await client.completions.create( + model=real_completions_response.model, + prompt="Hello, how are you?", + temperature=0.7, + max_tokens=50, + user=NOT_GIVEN, + ) - assert response.choices[0].text == real_completions_response.choices[0].text + assert response.choices[0].text == real_completions_response.choices[0].text + client.completions._post.assert_called_once() # Replay - with patch("openai.resources.completions.AsyncCompletions.create") as mock_create_patch: - with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): - client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") - response = await client.completions.create( - model=real_completions_response.model, - prompt="Hello, how are you?", - temperature=0.7, - max_tokens=50, - ) - assert response.choices[0].text == real_completions_response.choices[0].text - mock_create_patch.assert_not_called() + with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): + client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") + client.completions._post = AsyncMock(return_value=real_completions_response) + response = await client.completions.create( + model=real_completions_response.model, + prompt="Hello, how are you?", + temperature=0.7, + max_tokens=50, + ) + assert response.choices[0].text == real_completions_response.choices[0].text + client.completions._post.assert_not_called() async def test_live_mode(self, real_openai_chat_response): """Test that live mode passes through to original methods."""