From 8b46d5966b9bc9d29cf8cac01ddb137107e85a26 Mon Sep 17 00:00:00 2001
From: Emilio Garcia
Date: Tue, 11 Nov 2025 14:34:46 -0500
Subject: [PATCH] fix(telemetry): remove unnecessary calls to legacy tracing middleware

---
 src/llama_stack/core/utils/context.py              | 14 ++++----------
 .../providers/remote/inference/bedrock/bedrock.py  |  3 +--
 .../providers/remote/inference/watsonx/watsonx.py  |  3 +--
 .../utils/inference/litellm_openai_mixin.py        |  3 +--
 4 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/src/llama_stack/core/utils/context.py b/src/llama_stack/core/utils/context.py
index e7c61a8ed..0c3e41f00 100644
--- a/src/llama_stack/core/utils/context.py
+++ b/src/llama_stack/core/utils/context.py
@@ -7,8 +7,6 @@
 from collections.abc import AsyncGenerator
 from contextvars import ContextVar
 
-from llama_stack.core.telemetry.tracing import CURRENT_TRACE_CONTEXT
-
 _MISSING = object()
 
 
@@ -69,16 +67,12 @@ def preserve_contexts_async_generator[T](
             try:
                 yield item
                 # Update our tracked values with any changes made during this iteration
-                # Only for non-trace context vars - trace context must persist across yields
-                # to allow nested span tracking for telemetry
+                # This allows context changes to persist across generator iterations
                 for context_var in context_vars:
-                    if context_var is not CURRENT_TRACE_CONTEXT:
-                        initial_context_values[context_var.name] = context_var.get()
+                    initial_context_values[context_var.name] = context_var.get()
             finally:
-                # Restore non-trace context vars after each yield to prevent leaks between requests
-                # CURRENT_TRACE_CONTEXT is NOT restored here to preserve telemetry span stack
+                # Restore context vars after each yield to prevent leaks between requests
                 for context_var in context_vars:
-                    if context_var is not CURRENT_TRACE_CONTEXT:
-                        _restore_context_var(context_var)
+                    _restore_context_var(context_var)
 
     return wrapper()
diff --git a/src/llama_stack/providers/remote/inference/bedrock/bedrock.py b/src/llama_stack/providers/remote/inference/bedrock/bedrock.py
index 70ee95916..17d1aca77 100644
--- a/src/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/src/llama_stack/providers/remote/inference/bedrock/bedrock.py
@@ -8,7 +8,6 @@
 from collections.abc import AsyncIterator, Iterable
 
 from openai import AuthenticationError
-from llama_stack.core.telemetry.tracing import get_current_span
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 from llama_stack_api import (
@@ -84,7 +83,7 @@ class BedrockInferenceAdapter(OpenAIMixin):
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         """Override to enable streaming usage metrics and handle authentication errors."""
         # Enable streaming usage metrics when telemetry is active
-        if params.stream and get_current_span() is not None:
+        if params.stream:
             if params.stream_options is None:
                 params.stream_options = {"include_usage": True}
             elif "include_usage" not in params.stream_options:
diff --git a/src/llama_stack/providers/remote/inference/watsonx/watsonx.py b/src/llama_stack/providers/remote/inference/watsonx/watsonx.py
index aab9e2dca..8d204d72a 100644
--- a/src/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/src/llama_stack/providers/remote/inference/watsonx/watsonx.py
@@ -10,7 +10,6 @@
 from typing import Any
 import litellm
 import requests
-from llama_stack.core.telemetry.tracing import get_current_span
 from llama_stack.log import get_logger
 from llama_stack.providers.remote.inference.watsonx.config import WatsonXConfig
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
@@ -59,7 +58,7 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
 
         # Add usage tracking for streaming when telemetry is active
         stream_options = params.stream_options
-        if params.stream and get_current_span() is not None:
+        if params.stream:
             if stream_options is None:
                 stream_options = {"include_usage": True}
             elif "include_usage" not in stream_options:
diff --git a/src/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/src/llama_stack/providers/utils/inference/litellm_openai_mixin.py
index c462d1aad..47c68ff0a 100644
--- a/src/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/src/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@@ -217,10 +217,9 @@ class LiteLLMOpenAIMixin(
         params: OpenAIChatCompletionRequestWithExtraBody,
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         # Add usage tracking for streaming when telemetry is active
-        from llama_stack.core.telemetry.tracing import get_current_span
 
         stream_options = params.stream_options
-        if params.stream and get_current_span() is not None:
+        if params.stream:
             if stream_options is None:
                 stream_options = {"include_usage": True}
             elif "include_usage" not in stream_options:
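
Note (illustrative, not part of the patch): after this change, preserve_contexts_async_generator treats every context var uniformly: each one is tracked after a yield and restored in the finally block, with no special case for a trace-context var. The standalone sketch below mimics that pattern under assumed names (preserve_contexts, demo_var); it is not the llama_stack implementation.

    import asyncio
    from collections.abc import AsyncGenerator
    from contextvars import ContextVar

    # Hypothetical context var used only for this demo.
    demo_var: ContextVar[str] = ContextVar("demo_var", default="unset")


    def preserve_contexts[T](
        gen: AsyncGenerator[T, None], context_vars: list[ContextVar]
    ) -> AsyncGenerator[T, None]:
        """Sketch of the uniform track-and-restore pattern, not the real helper."""
        # Snapshot the values that were current when the wrapper was created.
        tracked = {cv.name: cv.get() for cv in context_vars}

        async def wrapper() -> AsyncGenerator[T, None]:
            async for item in gen:
                # Re-apply tracked values before handing control to the consumer.
                for cv in context_vars:
                    cv.set(tracked[cv.name])
                try:
                    yield item
                    # Pick up changes made while the consumer held control so they
                    # persist into the next iteration (no trace-var carve-out).
                    for cv in context_vars:
                        tracked[cv.name] = cv.get()
                finally:
                    # Restore every var after each yield to avoid leaking state.
                    for cv in context_vars:
                        cv.set(tracked[cv.name])

        return wrapper()


    async def main() -> None:
        async def numbers() -> AsyncGenerator[int, None]:
            for i in range(3):
                yield i

        async for n in preserve_contexts(numbers(), [demo_var]):
            demo_var.set(f"seen-{n}")  # the change survives into the next iteration
        print(demo_var.get())  # prints "seen-2"


    if __name__ == "__main__":
        asyncio.run(main())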
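
The three inference adapters touched here share the same stream_options handling; dropping the get_current_span() gate means usage reporting is requested whenever streaming is enabled, unless the caller already chose a value. A small standalone sketch of that logic follows; the helper name is hypothetical and not part of the codebase.

    def ensure_usage_in_stream_options(stream: bool, stream_options: dict | None) -> dict | None:
        """Hypothetical helper mirroring the include_usage logic in the patched adapters."""
        if not stream:
            # Non-streaming completions typically carry usage in the response body already.
            return stream_options
        if stream_options is None:
            return {"include_usage": True}
        if "include_usage" not in stream_options:
            return {**stream_options, "include_usage": True}
        # An explicit caller choice (True or False) is left untouched.
        return stream_options


    assert ensure_usage_in_stream_options(True, None) == {"include_usage": True}
    assert ensure_usage_in_stream_options(True, {"include_usage": False}) == {"include_usage": False}
    assert ensure_usage_in_stream_options(False, None) is None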