fix(telemetry): remove unnecessary calls to legacy tracing middleware

Emilio Garcia 2025-11-11 14:34:46 -05:00
parent 503522716f
commit 9e2b92b8d2
4 changed files with 7 additions and 16 deletions


@@ -7,8 +7,6 @@
from collections.abc import AsyncGenerator
from contextvars import ContextVar
from llama_stack.core.telemetry.tracing import CURRENT_TRACE_CONTEXT
_MISSING = object()
@@ -69,16 +67,12 @@ def preserve_contexts_async_generator[T](
try:
yield item
# Update our tracked values with any changes made during this iteration
# Only for non-trace context vars - trace context must persist across yields
# to allow nested span tracking for telemetry
# This allows context changes to persist across generator iterations
for context_var in context_vars:
if context_var is not CURRENT_TRACE_CONTEXT:
initial_context_values[context_var.name] = context_var.get()
finally:
# Restore non-trace context vars after each yield to prevent leaks between requests
# CURRENT_TRACE_CONTEXT is NOT restored here to preserve telemetry span stack
# Restore context vars after each yield to prevent leaks between requests
for context_var in context_vars:
if context_var is not CURRENT_TRACE_CONTEXT:
_restore_context_var(context_var)
return wrapper()
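
For reference, a minimal sketch of the shape the simplified wrapper takes after this change: every tracked ContextVar is handled uniformly, with no special case for CURRENT_TRACE_CONTEXT. Names like _MISSING and initial_context_values follow the lines above; the restore step is inlined here instead of calling _restore_context_var, and the whole thing is illustrative rather than the exact upstream code.

from collections.abc import AsyncGenerator
from contextvars import ContextVar

_MISSING = object()


def preserve_contexts_async_generator[T](
    gen: AsyncGenerator[T, None], context_vars: list[ContextVar]
) -> AsyncGenerator[T, None]:
    # Capture the caller's values once, before the generator starts running.
    initial_context_values = {var.name: var.get(_MISSING) for var in context_vars}

    async def wrapper() -> AsyncGenerator[T, None]:
        async for item in gen:
            # Re-apply the tracked values before handing control to the consumer.
            for var in context_vars:
                if initial_context_values[var.name] is not _MISSING:
                    var.set(initial_context_values[var.name])
            try:
                yield item
                # Update our tracked values with any changes made during this
                # iteration, so they persist across generator iterations.
                for var in context_vars:
                    initial_context_values[var.name] = var.get(_MISSING)
            finally:
                # Restore context vars after each yield to prevent leaks
                # between requests.
                for var in context_vars:
                    if initial_context_values[var.name] is not _MISSING:
                        var.set(initial_context_values[var.name])

    return wrapper()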


@@ -8,7 +8,6 @@ from collections.abc import AsyncIterator, Iterable
from openai import AuthenticationError
from llama_stack.core.telemetry.tracing import get_current_span
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack_api import (
@@ -84,7 +83,7 @@ class BedrockInferenceAdapter(OpenAIMixin):
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
"""Override to enable streaming usage metrics and handle authentication errors."""
# Enable streaming usage metrics when telemetry is active
if params.stream and get_current_span() is not None:
if params.stream:
if params.stream_options is None:
params.stream_options = {"include_usage": True}
elif "include_usage" not in params.stream_options:


@@ -10,7 +10,6 @@ from typing import Any
import litellm
import requests
from llama_stack.core.telemetry.tracing import get_current_span
from llama_stack.log import get_logger
from llama_stack.providers.remote.inference.watsonx.config import WatsonXConfig
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
@@ -59,7 +58,7 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
# Add usage tracking for streaming when telemetry is active
stream_options = params.stream_options
if params.stream and get_current_span() is not None:
if params.stream:
if stream_options is None:
stream_options = {"include_usage": True}
elif "include_usage" not in stream_options:


@@ -217,10 +217,9 @@ class LiteLLMOpenAIMixin(
params: OpenAIChatCompletionRequestWithExtraBody,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
# Add usage tracking for streaming when telemetry is active
from llama_stack.core.telemetry.tracing import get_current_span
stream_options = params.stream_options
if params.stream and get_current_span() is not None:
if params.stream:
if stream_options is None:
stream_options = {"include_usage": True}
elif "include_usage" not in stream_options: