feat: enable streaming usage metrics for OpenAI-compatible providers (#4326)

Inject `stream_options={"include_usage": True}` when a request is streaming and
an OpenTelemetry span is actively recording. Telemetry always overrides any
caller preference (including an explicit `include_usage=False`) so that
observability metrics stay complete and consistent.
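
For context, here is a minimal sketch of the helper's behavior, reconstructed from the unit tests in this PR; the shipped implementation may differ in detail:

```python
# Sketch only: behavior reconstructed from the TestGetStreamOptionsForTelemetry
# unit tests below; not a verbatim copy of the shipped helper.
from opentelemetry import trace


def get_stream_options_for_telemetry(
    stream_options: dict | None, stream: bool
) -> dict | None:
    # Non-streaming requests pass through untouched.
    if not stream:
        return stream_options

    span = trace.get_current_span()
    # No recording span means telemetry is off: honor the caller's options.
    if span is None or not span.is_recording():
        return stream_options

    # Return a new dict (never mutate the caller's) and force
    # include_usage=True, overriding an explicit include_usage=False.
    return {**(stream_options or {}), "include_usage": True}
```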

Changes:
- Add conditional stream_options injection to OpenAIMixin (benefits the
  OpenAI, Bedrock, Runpod, Together, and Fireworks providers)
- Add conditional stream_options injection to LiteLLMOpenAIMixin
  (benefits WatsonX and other litellm-based providers)
- Check telemetry status using trace.get_current_span().is_recording()
  (see the sketch after this list)
- Override include_usage=False when telemetry is active to prevent metric
  gaps
- Add unit tests for this functionality
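
A rough sketch of the call-site gating (the `supports_stream_options` flag and helper import match the tests in this PR; the client plumbing shown here is a simplified assumption):

```python
# Hypothetical call site in OpenAIMixin; real parameter plumbing is more involved.
from llama_stack.providers.utils.inference.openai_compat import (
    get_stream_options_for_telemetry,
)


class OpenAIMixin:
    # Providers that reject stream_options (e.g. Ollama, vLLM) set this
    # to False to opt out of injection entirely.
    supports_stream_options: bool = True

    async def openai_chat_completion(self, params):
        stream_options = params.stream_options
        if self.supports_stream_options:
            # Returns a new dict when a span is recording; never mutates params.
            stream_options = get_stream_options_for_telemetry(stream_options, params.stream)
        return await self.client.chat.completions.create(
            model=params.model,
            messages=params.messages,
            stream=params.stream,
            stream_options=stream_options,
        )
```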

Fixes #3981

Note: this work originated in PR #4200, which I closed after rebasing on
the telemetry changes. This PR rebases those commits, incorporates the
Bedrock feedback, and carries forward the same scope described there.

## Test Plan

#### OpenAIMixin + telemetry injection tests

PYTHONPATH=src python -m pytest tests/unit/providers/utils/inference/test_openai_mixin.py

#### LiteLLM OpenAIMixin tests

PYTHONPATH=src python -m pytest tests/unit/providers/inference/test_litellm_openai_mixin.py -v

#### Broader inference provider tests

PYTHONPATH=src python -m pytest tests/unit/providers/inference/ --ignore=tests/unit/providers/inference/test_inference_client_caching.py -v


@@ -0,0 +1,68 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from unittest.mock import MagicMock, patch

from llama_stack.providers.utils.inference.openai_compat import (
    get_stream_options_for_telemetry,
)


class TestGetStreamOptionsForTelemetry:
    def test_returns_original_when_not_streaming(self):
        stream_options = {"keep": True}
        result = get_stream_options_for_telemetry(stream_options, False)
        assert result is stream_options

    def test_streaming_without_active_span_returns_original(self):
        stream_options = {"keep": True}

        with patch("opentelemetry.trace.get_current_span", return_value=None):
            result = get_stream_options_for_telemetry(stream_options, True)

        assert result is stream_options

    def test_streaming_with_inactive_span_returns_original(self):
        stream_options = {"keep": True}
        mock_span = MagicMock()
        mock_span.is_recording.return_value = False

        with patch("opentelemetry.trace.get_current_span", return_value=mock_span):
            result = get_stream_options_for_telemetry(stream_options, True)

        assert result is stream_options

    def test_streaming_with_active_span_adds_usage_when_missing(self):
        mock_span = MagicMock()
        mock_span.is_recording.return_value = True

        with patch("opentelemetry.trace.get_current_span", return_value=mock_span):
            result = get_stream_options_for_telemetry(None, True)

        assert result == {"include_usage": True}

    def test_streaming_with_active_span_merges_existing_options(self):
        stream_options = {"other_option": "value"}
        mock_span = MagicMock()
        mock_span.is_recording.return_value = True

        with patch("opentelemetry.trace.get_current_span", return_value=mock_span):
            result = get_stream_options_for_telemetry(stream_options, True)

        assert result == {"other_option": "value", "include_usage": True}
        assert stream_options == {"other_option": "value"}

    def test_streaming_with_active_span_overrides_include_usage_false(self):
        stream_options = {"include_usage": False}
        mock_span = MagicMock()
        mock_span.is_recording.return_value = True

        with patch("opentelemetry.trace.get_current_span", return_value=mock_span):
            result = get_stream_options_for_telemetry(stream_options, True)

        assert result["include_usage"] is True


@@ -934,3 +934,214 @@ class TestOpenAIMixinAllowedModelsInference:
                    model="gpt-4", messages=[OpenAIUserMessageParam(role="user", content="Hello")]
                )
            )


class TestOpenAIMixinStreamOptionsInjection:
    """Test cases for automatic stream_options injection when telemetry is active"""

    async def test_chat_completion_injects_stream_options_when_telemetry_active(self, mixin, mock_client_context):
        """Test that stream_options is injected for streaming chat completion when telemetry is active"""
        mock_client = MagicMock()
        mock_client.chat.completions.create = AsyncMock(return_value=MagicMock())

        # Mock OpenTelemetry span as recording
        mock_span = MagicMock()
        mock_span.is_recording.return_value = True

        with mock_client_context(mixin, mock_client):
            with patch("opentelemetry.trace.get_current_span", return_value=mock_span):
                await mixin.openai_chat_completion(
                    OpenAIChatCompletionRequestWithExtraBody(
                        model="gpt-4", messages=[OpenAIUserMessageParam(role="user", content="Hello")], stream=True
                    )
                )

        mock_client.chat.completions.create.assert_called_once()
        call_kwargs = mock_client.chat.completions.create.call_args[1]
        assert call_kwargs["stream_options"] == {"include_usage": True}

    async def test_chat_completion_preserves_existing_stream_options(self, mixin, mock_client_context):
        """Test that existing stream_options are preserved with include_usage added"""
        mock_client = MagicMock()
        mock_client.chat.completions.create = AsyncMock(return_value=MagicMock())

        mock_span = MagicMock()
        mock_span.is_recording.return_value = True

        with mock_client_context(mixin, mock_client):
            with patch("opentelemetry.trace.get_current_span", return_value=mock_span):
                await mixin.openai_chat_completion(
                    OpenAIChatCompletionRequestWithExtraBody(
                        model="gpt-4",
                        messages=[OpenAIUserMessageParam(role="user", content="Hello")],
                        stream=True,
                        stream_options={"other_option": True},
                    )
                )

        call_kwargs = mock_client.chat.completions.create.call_args[1]
        assert call_kwargs["stream_options"] == {"other_option": True, "include_usage": True}

    async def test_chat_completion_no_injection_when_telemetry_inactive(self, mixin, mock_client_context):
        """Test that stream_options is NOT injected when telemetry is inactive"""
        mock_client = MagicMock()
        mock_client.chat.completions.create = AsyncMock(return_value=MagicMock())

        # Mock OpenTelemetry span as not recording
        mock_span = MagicMock()
        mock_span.is_recording.return_value = False

        with mock_client_context(mixin, mock_client):
            with patch("opentelemetry.trace.get_current_span", return_value=mock_span):
                await mixin.openai_chat_completion(
                    OpenAIChatCompletionRequestWithExtraBody(
                        model="gpt-4", messages=[OpenAIUserMessageParam(role="user", content="Hello")], stream=True
                    )
                )

        call_kwargs = mock_client.chat.completions.create.call_args[1]
        assert "stream_options" not in call_kwargs or call_kwargs["stream_options"] is None

    async def test_chat_completion_no_injection_when_not_streaming(self, mixin, mock_client_context):
        """Test that stream_options is NOT injected for non-streaming requests"""
        mock_client = MagicMock()
        mock_client.chat.completions.create = AsyncMock(return_value=MagicMock())

        mock_span = MagicMock()
        mock_span.is_recording.return_value = True

        with mock_client_context(mixin, mock_client):
            with patch("opentelemetry.trace.get_current_span", return_value=mock_span):
                await mixin.openai_chat_completion(
                    OpenAIChatCompletionRequestWithExtraBody(
                        model="gpt-4", messages=[OpenAIUserMessageParam(role="user", content="Hello")], stream=False
                    )
                )

        call_kwargs = mock_client.chat.completions.create.call_args[1]
        assert "stream_options" not in call_kwargs or call_kwargs["stream_options"] is None

    async def test_completion_injects_stream_options_when_telemetry_active(self, mixin, mock_client_context):
        """Test that stream_options is injected for streaming completion when telemetry is active"""
        mock_client = MagicMock()
        mock_client.completions.create = AsyncMock(return_value=MagicMock())

        mock_span = MagicMock()
        mock_span.is_recording.return_value = True

        with mock_client_context(mixin, mock_client):
            with patch("opentelemetry.trace.get_current_span", return_value=mock_span):
                await mixin.openai_completion(
                    OpenAICompletionRequestWithExtraBody(model="text-davinci-003", prompt="Hello", stream=True)
                )

        mock_client.completions.create.assert_called_once()
        call_kwargs = mock_client.completions.create.call_args[1]
        assert call_kwargs["stream_options"] == {"include_usage": True}

    async def test_completion_no_injection_when_telemetry_inactive(self, mixin, mock_client_context):
        """Test that stream_options is NOT injected for completion when telemetry is inactive"""
        mock_client = MagicMock()
        mock_client.completions.create = AsyncMock(return_value=MagicMock())

        mock_span = MagicMock()
        mock_span.is_recording.return_value = False

        with mock_client_context(mixin, mock_client):
            with patch("opentelemetry.trace.get_current_span", return_value=mock_span):
                await mixin.openai_completion(
                    OpenAICompletionRequestWithExtraBody(model="text-davinci-003", prompt="Hello", stream=True)
                )

        call_kwargs = mock_client.completions.create.call_args[1]
        assert "stream_options" not in call_kwargs or call_kwargs["stream_options"] is None

    async def test_params_not_mutated(self, mixin, mock_client_context):
        """Test that original params object is not mutated when stream_options is injected"""
        mock_client = MagicMock()
        mock_client.chat.completions.create = AsyncMock(return_value=MagicMock())

        mock_span = MagicMock()
        mock_span.is_recording.return_value = True

        original_params = OpenAIChatCompletionRequestWithExtraBody(
            model="gpt-4", messages=[OpenAIUserMessageParam(role="user", content="Hello")], stream=True
        )

        with mock_client_context(mixin, mock_client):
            with patch("opentelemetry.trace.get_current_span", return_value=mock_span):
                await mixin.openai_chat_completion(original_params)

        # Original params should not be modified
        assert original_params.stream_options is None

    async def test_chat_completion_overrides_include_usage_false(self, mixin, mock_client_context):
        """Test that include_usage=False is overridden when telemetry is active"""
        mock_client = MagicMock()
        mock_client.chat.completions.create = AsyncMock(return_value=MagicMock())

        mock_span = MagicMock()
        mock_span.is_recording.return_value = True

        with mock_client_context(mixin, mock_client):
            with patch("opentelemetry.trace.get_current_span", return_value=mock_span):
                await mixin.openai_chat_completion(
                    OpenAIChatCompletionRequestWithExtraBody(
                        model="gpt-4",
                        messages=[OpenAIUserMessageParam(role="user", content="Hello")],
                        stream=True,
                        stream_options={"include_usage": False},
                    )
                )

        call_kwargs = mock_client.chat.completions.create.call_args[1]
        # Telemetry must override False to ensure complete metrics
        assert call_kwargs["stream_options"]["include_usage"] is True

    async def test_no_injection_when_provider_doesnt_support_stream_options(self, mixin, mock_client_context):
        """Test that stream_options is NOT injected when provider doesn't support it"""
        # Set supports_stream_options to False (like Ollama/vLLM)
        mixin.supports_stream_options = False

        mock_client = MagicMock()
        mock_client.chat.completions.create = AsyncMock(return_value=MagicMock())

        # Mock OpenTelemetry span as recording (telemetry is active)
        mock_span = MagicMock()
        mock_span.is_recording.return_value = True

        with mock_client_context(mixin, mock_client):
            with patch("opentelemetry.trace.get_current_span", return_value=mock_span):
                await mixin.openai_chat_completion(
                    OpenAIChatCompletionRequestWithExtraBody(
                        model="gpt-4", messages=[OpenAIUserMessageParam(role="user", content="Hello")], stream=True
                    )
                )

        call_kwargs = mock_client.chat.completions.create.call_args[1]
        # Should NOT inject stream_options even though telemetry is active
        assert "stream_options" not in call_kwargs or call_kwargs["stream_options"] is None

    async def test_completion_no_injection_when_provider_doesnt_support_stream_options(
        self, mixin, mock_client_context
    ):
        """Test that stream_options is NOT injected for completion when provider doesn't support it"""
        # Set supports_stream_options to False (like Ollama/vLLM)
        mixin.supports_stream_options = False

        mock_client = MagicMock()
        mock_client.completions.create = AsyncMock(return_value=MagicMock())

        # Mock OpenTelemetry span as recording (telemetry is active)
        mock_span = MagicMock()
        mock_span.is_recording.return_value = True

        with mock_client_context(mixin, mock_client):
            with patch("opentelemetry.trace.get_current_span", return_value=mock_span):
                await mixin.openai_completion(
                    OpenAICompletionRequestWithExtraBody(model="text-davinci-003", prompt="Hello", stream=True)
                )

        call_kwargs = mock_client.completions.create.call_args[1]
        # Should NOT inject stream_options even though telemetry is active
        assert "stream_options" not in call_kwargs or call_kwargs["stream_options"] is None