feat: remove core.telemetry as a dependency of llama_stack.apis (#4064)

# What does this PR do? Remove circular dependency by moving tracing from API protocol definitions to router implementation layer. This gets us closer to having a self contained API package with no other cross-cutting dependencies to other parts of the llama stack codebase. To the best of our ability, the llama_stack.api should only be type and protocol definitions. Changes: - Create apis/common/tracing.py with marker decorator (zero core dependencies) - Add the _new_ `@telemetry_traceable` marker decorator to 11 protocol classes - Apply actual tracing in core/resolver.py in `instantiate_provider` based on protocol marker - Move MetricResponseMixin from core to apis (it's an API response type) - APIs package is now self-contained with zero core dependencies The tracing functionality remains identical - actual trace_protocol from core is applied to router implementations at runtime when both telemetry is enabled and the protocol has the `__marked_for_tracing__` marker. ## Test Plan Manual integration test confirms identical behavior to main branch: ```bash llama stack list-deps --format uv starter | sh export OLLAMA_URL=http://localhost:11434 llama stack run starter curl -X POST http://localhost:8321/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{"model": "ollama/gpt-oss:20b", "messages": [{"role": "user", "content": "Say hello"}], "max_tokens": 10}' ``` Verified identical between main and this branch: - trace_id present in response - metrics array with prompt_tokens, completion_tokens, total_tokens - Server logs show trace_protocol applied to all routers Existing telemetry integration tests (tests/integration/telemetry/) validate trace context propagation and span attributes. relates to #3895 --------- Signed-off-by: Charlie Doern <cdoern@redhat.com>
2025-12-03 09:53:45 +00:00 · 2025-11-06 13:58:30 -05:00 · 2025-11-06 13:58:30 -05:00 · 9df073450f
commit 9df073450f
parent dc9497a3b2
15 changed files with 106 additions and 62 deletions
--- a/src/llama_stack/apis/common/responses.py
+++ b/src/llama_stack/apis/common/responses.py
@ -34,3 +34,44 @@ class PaginatedResponse(BaseModel):
    data: list[dict[str, Any]]
    has_more: bool
    url: str | None = None
+
+
+# This is a short term solution to allow inference API to return metrics
+# The ideal way to do this is to have a way for all response types to include metrics
+# and all metric events logged to the telemetry API to be included with the response
+# To do this, we will need to augment all response types with a metrics field.
+# We have hit a blocker from stainless SDK that prevents us from doing this.
+# The blocker is that if we were to augment the response types that have a data field
+# in them like so
+# class ListModelsResponse(BaseModel):
+# metrics: Optional[List[MetricEvent]] = None
+# data: List[Models]
+# ...
+# The client SDK will need to access the data by using a .data field, which is not
+# ergonomic. Stainless SDK does support unwrapping the response type, but it
+# requires that the response type to only have a single field.
+
+# We will need a way in the client SDK to signal that the metrics are needed
+# and if they are needed, the client SDK has to return the full response type
+# without unwrapping it.
+
+
+@json_schema_type
+class MetricInResponse(BaseModel):
+    """A metric value included in API responses.
+    :param metric: The name of the metric
+    :param value: The numeric value of the metric
+    :param unit: (Optional) The unit of measurement for the metric value
+    """
+
+    metric: str
+    value: int | float
+    unit: str | None = None
+
+
+class MetricResponseMixin(BaseModel):
+    """Mixin class for API responses that can include metrics.
+    :param metrics: (Optional) List of metrics associated with the API response
+    """
+
+    metrics: list[MetricInResponse] | None = None
--- a/src/llama_stack/apis/common/tracing.py
+++ b/src/llama_stack/apis/common/tracing.py
@ -0,0 +1,22 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+def telemetry_traceable(cls):
+    """
+    Mark a protocol for automatic tracing when telemetry is enabled.
+
+    This is a metadata-only decorator with no dependencies on core.
+    Actual tracing is applied by core routers at runtime if telemetry is enabled.
+
+    Usage:
+        @runtime_checkable
+        @telemetry_traceable
+        class MyProtocol(Protocol):
+            ...
+    """
+    cls.__marked_for_tracing__ = True
+    return cls
--- a/src/llama_stack/apis/conversations/conversations.py
+++ b/src/llama_stack/apis/conversations/conversations.py
@ -20,8 +20,8 @@ from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseOutputMessageMCPListTools,
    OpenAIResponseOutputMessageWebSearchToolCall,
 )
+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod

 Metadata = dict[str, str]
@ -157,7 +157,7 @@ class ConversationItemDeletedResource(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Conversations(Protocol):
    """Conversations

--- a/src/llama_stack/apis/files/files.py
+++ b/src/llama_stack/apis/files/files.py
@ -11,8 +11,8 @@ from fastapi import File, Form, Response, UploadFile
 from pydantic import BaseModel, Field

 from llama_stack.apis.common.responses import Order
+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -102,7 +102,7 @@ class OpenAIFileDeleteResponse(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Files(Protocol):
    """Files

--- a/src/llama_stack/apis/inference/inference.py
+++ b/src/llama_stack/apis/inference/inference.py
@ -19,11 +19,10 @@ from pydantic import BaseModel, Field, field_validator
 from typing_extensions import TypedDict

 from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
-from llama_stack.apis.common.responses import Order
+from llama_stack.apis.common.responses import MetricResponseMixin, Order
+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.models import Model
 from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
-from llama_stack.core.telemetry.telemetry import MetricResponseMixin
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.models.llama.datatypes import (
    BuiltinTool,
    StopReason,
@ -1160,7 +1159,7 @@ class OpenAIEmbeddingsRequestWithExtraBody(BaseModel, extra="allow"):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class InferenceProvider(Protocol):
    """
    This protocol defines the interface that should be implemented by all inference providers.
--- a/src/llama_stack/apis/models/models.py
+++ b/src/llama_stack/apis/models/models.py
@ -9,9 +9,9 @@ from typing import Any, Literal, Protocol, runtime_checkable

 from pydantic import BaseModel, ConfigDict, Field, field_validator

+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -105,7 +105,7 @@ class OpenAIListModelsResponse(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Models(Protocol):
    async def list_models(self) -> ListModelsResponse:
        """List all models.
--- a/src/llama_stack/apis/prompts/prompts.py
+++ b/src/llama_stack/apis/prompts/prompts.py
@ -10,8 +10,8 @@ from typing import Protocol, runtime_checkable

 from pydantic import BaseModel, Field, field_validator, model_validator

+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -92,7 +92,7 @@ class ListPromptsResponse(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Prompts(Protocol):
    """Prompts

--- a/src/llama_stack/apis/safety/safety.py
+++ b/src/llama_stack/apis/safety/safety.py
@ -9,10 +9,10 @@ from typing import Any, Protocol, runtime_checkable

 from pydantic import BaseModel, Field

+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.inference import OpenAIMessageParam
 from llama_stack.apis.shields import Shield
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -94,7 +94,7 @@ class ShieldStore(Protocol):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Safety(Protocol):
    """Safety

--- a/src/llama_stack/apis/shields/shields.py
+++ b/src/llama_stack/apis/shields/shields.py
@ -8,9 +8,9 @@ from typing import Any, Literal, Protocol, runtime_checkable

 from pydantic import BaseModel

+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -48,7 +48,7 @@ class ListShieldsResponse(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Shields(Protocol):
    @webmethod(route="/shields", method="GET", level=LLAMA_STACK_API_V1)
    async def list_shields(self) -> ListShieldsResponse:
--- a/src/llama_stack/apis/tools/tools.py
+++ b/src/llama_stack/apis/tools/tools.py
@ -11,9 +11,9 @@ from pydantic import BaseModel
 from typing_extensions import runtime_checkable

 from llama_stack.apis.common.content_types import URL, InterleavedContent
+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -107,7 +107,7 @@ class ListToolDefsResponse(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class ToolGroups(Protocol):
    @webmethod(route="/toolgroups", method="POST", level=LLAMA_STACK_API_V1)
    async def register_tool_group(
@ -189,7 +189,7 @@ class SpecialToolGroup(Enum):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class ToolRuntime(Protocol):
    tool_store: ToolStore | None = None

--- a/src/llama_stack/apis/vector_io/vector_io.py
+++ b/src/llama_stack/apis/vector_io/vector_io.py
@ -13,10 +13,10 @@ from typing import Annotated, Any, Literal, Protocol, runtime_checkable
 from fastapi import Body
 from pydantic import BaseModel, Field

+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.inference import InterleavedContent
 from llama_stack.apis.vector_stores import VectorStore
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod
 from llama_stack.strong_typing.schema import register_schema

@ -502,7 +502,7 @@ class VectorStoreTable(Protocol):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class VectorIO(Protocol):
    vector_store_table: VectorStoreTable | None = None