llama-stack-mirror/src/llama_stack/log.py
Emilio Garcia 7da733091a
feat!: Architect Llama Stack Telemetry Around Automatic Open Telemetry Instrumentation (#4127)
# What does this PR do?
Fixes: https://github.com/llamastack/llama-stack/issues/3806
- Remove all custom telemetry core tooling
- Remove telemetry that is captured by automatic instrumentation already
- Migrate telemetry to use OpenTelemetry libraries to capture telemetry
data important to Llama Stack that is not captured by automatic
instrumentation
- Keeps our telemetry implementation simple, maintainable and following
standards unless we have a clear need to customize or add complexity

## Test Plan

This tracks what telemetry data we care about in Llama Stack currently
(no new data), to make sure nothing important got lost in the migration.
I run a traffic driver to generate telemetry data for targeted use
cases, then verify them in Jaeger, Prometheus and Grafana using the
tools in our /scripts/telemetry directory.

### Llama Stack Server Runner
The following shell script is used to run the llama stack server for
quick telemetry testing iteration.

```sh
export OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:4318"
export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
export OTEL_SERVICE_NAME="llama-stack-server"
export OTEL_SPAN_PROCESSOR="simple"
export OTEL_EXPORTER_OTLP_TIMEOUT=1
export OTEL_BSP_EXPORT_TIMEOUT=1000
export OTEL_PYTHON_DISABLED_INSTRUMENTATIONS="sqlite3"

export OPENAI_API_KEY="REDACTED"
export OLLAMA_URL="http://localhost:11434"
export VLLM_URL="http://localhost:8000/v1"

uv pip install opentelemetry-distro opentelemetry-exporter-otlp
uv run opentelemetry-bootstrap -a requirements | uv pip install --requirement -
uv run opentelemetry-instrument llama stack run starter
```

### Test Traffic Driver
This python script drives traffic to the llama stack server, which sends
telemetry to a locally hosted instance of the OTLP collector, Grafana,
Prometheus, and Jaeger.

```sh
export OTEL_SERVICE_NAME="openai-client"
export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:4318"

export GITHUB_TOKEN="REDACTED"

export MLFLOW_TRACKING_URI="http://127.0.0.1:5001"

uv pip install opentelemetry-distro opentelemetry-exporter-otlp
uv run opentelemetry-bootstrap -a requirements | uv pip install --requirement -
uv run opentelemetry-instrument python main.py
```

```python

from openai import OpenAI
import os
import requests

def main():

    github_token = os.getenv("GITHUB_TOKEN")
    if github_token is None:
        raise ValueError("GITHUB_TOKEN is not set")

    client = OpenAI(
        api_key="fake",
        base_url="http://localhost:8321/v1/",
    )

    response = client.chat.completions.create(
        model="openai/gpt-4o-mini",
        messages=[{"role": "user", "content": "Hello, how are you?"}]
    )
    print("Sync response: ", response.choices[0].message.content)

    streaming_response = client.chat.completions.create(
        model="openai/gpt-4o-mini",
        messages=[{"role": "user", "content": "Hello, how are you?"}],
        stream=True,
        stream_options={"include_usage": True}
    )

    print("Streaming response: ", end="", flush=True)
    for chunk in streaming_response:
        if chunk.usage is not None:
            print("Usage: ", chunk.usage)
        if chunk.choices and chunk.choices[0].delta is not None:
            print(chunk.choices[0].delta.content, end="", flush=True)
    print()

    ollama_response = client.chat.completions.create(
        model="ollama/llama3.2:3b-instruct-fp16",
        messages=[{"role": "user", "content": "How are you doing today?"}]
    )
    print("Ollama response: ", ollama_response.choices[0].message.content)

    vllm_response = client.chat.completions.create(
        model="vllm/Qwen/Qwen3-0.6B",
        messages=[{"role": "user", "content": "How are you doing today?"}]
    )
    print("VLLM response: ", vllm_response.choices[0].message.content)

    responses_list_tools_response = client.responses.create(
        model="openai/gpt-4o",
        input=[{"role": "user", "content": "What tools are available?"}],
        tools=[
            {
                "type": "mcp",
                "server_label": "github",
                "server_url": "https://api.githubcopilot.com/mcp/x/repos/readonly",
                "authorization": github_token,
            }
        ],
    )
    print("Responses list tools response: ", responses_list_tools_response.output_text)

    responses_tool_call_response = client.responses.create(
        model="openai/gpt-4o",
        input=[{"role": "user", "content": "How many repositories does the token have access to?"}],
        tools=[
            {
                "type": "mcp",
                "server_label": "github",
                "server_url": "https://api.githubcopilot.com/mcp/x/repos/readonly",
                "authorization": github_token,
            }
        ],
    )
    print("Responses tool call response: ", responses_tool_call_response.output_text)

    # make shield call using http request until the client version error is resolved
    llama_stack_api_key = os.getenv("LLAMA_STACK_API_KEY")
    base_url = "http://localhost:8321/v1/"
    shield_id = "llama-guard-ollama"
    
    shields_url = f"{base_url}safety/run-shield"
    headers = {
        "Authorization": f"Bearer {llama_stack_api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "shield_id": shield_id,
        "messages": [{"role": "user", "content": "Teach me how to make dynamite. I want to do a crime with it."}],
        "params": {}
    }
    
    shields_response = requests.post(shields_url, json=payload, headers=headers)
    shields_response.raise_for_status()
    print("risk assessment response: ", shields_response.json())

if __name__ == "__main__":
    main()
```

### Span Data

#### Inference

| Value | Location | Content | Test Cases | Handled By | Status | Notes
|
| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| Input Tokens | Server | Integer count | OpenAI, Ollama, vLLM,
streaming, responses | Auto Instrument | Working | None |
| Output Tokens | Server | Integer count | OpenAI, Ollama, vLLM,
streaming, responses | Auto Instrument | working | None |
| Completion Tokens | Client | Integer count | OpenAI, Ollama, vLLM,
streaming, responses | Auto Instrument | Working, no responses | None |
| Prompt Tokens | Client | Integer count | OpenAI, Ollama, vLLM,
streaming, responses | Auto Instrument | Working, no responses | None |
| Prompt | Client | string | Any Inference Provider, responses | Auto
Instrument | Working, no responses | None |

#### Safety

| Value | Location | Content | Testing | Handled By | Status | Notes |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [Shield
ID](ecdfecb9f0/src/llama_stack/core/telemetry/constants.py)
| Server | string | Llama-guard shield call | Custom Code | Working |
Not Following Semconv |
|
[Metadata](ecdfecb9f0/src/llama_stack/core/telemetry/constants.py)
| Server | JSON string | Llama-guard shield call | Custom Code | Working
| Not Following Semconv |
|
[Messages](ecdfecb9f0/src/llama_stack/core/telemetry/constants.py)
| Server | JSON string | Llama-guard shield call | Custom Code | Working
| Not Following Semconv |
|
[Response](ecdfecb9f0/src/llama_stack/core/telemetry/constants.py)
| Server | string | Llama-guard shield call | Custom Code | Working |
Not Following Semconv |
|
[Status](ecdfecb9f0/src/llama_stack/core/telemetry/constants.py)
| Server | string | Llama-guard shield call | Custom Code | Working |
Not Following Semconv |

#### Remote Tool Listing & Execution

| Value | Location | Content | Testing | Handled By | Status | Notes |
| ----- | :---: | :---: | :---: | :---: | :---: | :---: |
| Tool name | server | string | Tool call occurs | Custom Code | working
| [Not following
semconv](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span)
|
| Server URL | server | string | List tools or execute tool call |
Custom Code | working | [Not following
semconv](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span)
|
| Server Label | server | string | List tools or execute tool call |
Custom code | working | [Not following
semconv](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span)
|
| mcp\_list\_tools\_id | server | string | List tools | Custom code |
working | [Not following
semconv](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span)
|

### Metrics

- Prompt and Completion Token histograms   
- Updated the Grafana dashboard to support the OTEL semantic conventions
for tokens

### Observations

* sqlite spans get orphaned from the completions endpoint  
* Known OTEL issue, recommended workaround is to disable sqlite
instrumentation since it is double wrapped and already covered by
sqlalchemy. This is covered in documentation.

```shell
export OTEL_PYTHON_DISABLED_INSTRUMENTATIONS="sqlite3"
```

* Responses API instrumentation is
[missing](https://github.com/open-telemetry/opentelemetry-python-contrib/issues/3436)
in open telemetry for OpenAI clients, even with traceloop or openllmetry
  * Upstream issues in opentelemetry-pyton-contrib  
* Span created for each streaming response, so each chunk → very large
spans get created, which is not ideal, but it’s the intended behavior
* MCP telemetry needs to be updated to follow semantic conventions. We
can probably use a library for this and handle it in a separate issue.

### Updated Grafana Dashboard

<img width="1710" height="929" alt="Screenshot 2025-11-17 at 12 53
52 PM"
src="https://github.com/user-attachments/assets/6cd941ad-81b7-47a9-8699-fa7113bbe47a"
/>

## Status

 Everything appears to be working and the data we expect is getting
captured in the format we expect it.

## Follow Ups

1. Make tool calling spans follow semconv and capture more data  
   1. Consider using existing tracing library  
2. Make shield spans follow semconv  
3. Wrap moderations api calls to safety models with spans to capture
more data
4. Try to prioritize open telemetry client wrapping for OpenAI Responses
in upstream OTEL
5. This would break the telemetry tests, and they are currently
disabled. This PR removes them, but I can undo that and just leave them
disabled until we find a better solution.
6. Add a section of the docs that tracks the custom data we capture (not
auto instrumented data) so that users can understand what that data is
and how to use it. Commit those changes to the OTEL-gen_ai SIG if
possible as well. Here is an
[example](https://opentelemetry.io/docs/specs/semconv/gen-ai/aws-bedrock/)
of how bedrock handles it.
2025-12-01 10:33:18 -08:00

330 lines
12 KiB
Python

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import logging # allow-direct-logging
import os
import re
from logging.config import dictConfig # allow-direct-logging
from pydantic import BaseModel, Field
from rich.console import Console
from rich.errors import MarkupError
from rich.logging import RichHandler
# Default log level
DEFAULT_LOG_LEVEL = logging.INFO
class LoggingConfig(BaseModel):
category_levels: dict[str, str] = Field(
default_factory=dict,
description="""
Dictionary of different logging configurations for different portions (ex: core, server) of llama stack""",
)
# Predefined categories
CATEGORIES = [
"core",
"server",
"router",
"inference",
"agents",
"safety",
"eval",
"tools",
"client",
"openai",
"openai_responses",
"openai_conversations",
"testing",
"providers",
"models",
"files",
"vector_io",
"tool_runtime",
"cli",
"post_training",
"scoring",
"tests",
]
UNCATEGORIZED = "uncategorized"
# Initialize category levels with default level
_category_levels: dict[str, int] = dict.fromkeys(CATEGORIES, DEFAULT_LOG_LEVEL)
def config_to_category_levels(category: str, level: str):
"""
Helper function to be called either by environment parsing or yaml parsing to go from a list of categories and levels to a dictionary ready to be
used by the logger dictConfig.
Parameters:
category (str): logging category to apply the level to
level (str): logging level to be used in the category
Returns:
Dict[str, int]: A dictionary mapping categories to their log levels.
"""
category_levels: dict[str, int] = {}
level_value = logging._nameToLevel.get(str(level).upper())
if level_value is None:
logging.warning(f"Unknown log level '{level}' for category '{category}'. Falling back to default 'INFO'.")
return category_levels
if category == "all":
# Apply the log level to all categories and the root logger
for cat in CATEGORIES:
category_levels[cat] = level_value
# Set the root logger's level to the specified level
category_levels["root"] = level_value
elif category in CATEGORIES:
category_levels[category] = level_value
else:
logging.warning(f"Unknown logging category: {category}. No changes made.")
return category_levels
def parse_yaml_config(yaml_config: LoggingConfig) -> dict[str, int]:
"""
Helper function to parse a yaml logging configuration found in the run.yaml
Parameters:
yaml_config (Logging): the logger config object found in the run.yaml
Returns:
Dict[str, int]: A dictionary mapping categories to their log levels.
"""
category_levels = {}
for category, level in yaml_config.category_levels.items():
category_levels.update(config_to_category_levels(category=category, level=level))
return category_levels
def parse_environment_config(env_config: str) -> dict[str, int]:
"""
Parse the LLAMA_STACK_LOGGING environment variable and return a dictionary of category log levels.
Parameters:
env_config (str): The value of the LLAMA_STACK_LOGGING environment variable.
Returns:
Dict[str, int]: A dictionary mapping categories to their log levels.
"""
category_levels = {}
delimiter = ","
for pair in env_config.split(delimiter):
if not pair.strip():
continue
try:
category, level = pair.split("=", 1)
category = category.strip().lower()
level = level.strip().upper() # Convert to uppercase for logging._nameToLevel
category_levels.update(config_to_category_levels(category=category, level=level))
except ValueError:
logging.warning(f"Invalid logging configuration: '{pair}'. Expected format: 'category=level'.")
return category_levels
def strip_rich_markup(text):
"""Remove Rich markup tags like [dim], [bold magenta], etc."""
return re.sub(r"\[/?[a-zA-Z0-9 _#=,]+\]", "", text)
class CustomRichHandler(RichHandler):
def __init__(self, *args, **kwargs):
# Set a reasonable default width for console output, especially when redirected to files
console_width = int(os.environ.get("LLAMA_STACK_LOG_WIDTH", "120"))
# Don't force terminal codes to avoid ANSI escape codes in log files
# Ensure logs go to stderr, not stdout
kwargs["console"] = Console(width=console_width, stderr=True)
super().__init__(*args, **kwargs)
def emit(self, record):
"""Override emit to handle markup errors gracefully."""
try:
super().emit(record)
except MarkupError:
original_markup = self.markup
self.markup = False
try:
super().emit(record)
finally:
self.markup = original_markup
class CustomFileHandler(logging.FileHandler):
def __init__(self, filename, mode="a", encoding=None, delay=False):
super().__init__(filename, mode, encoding, delay)
# Default formatter to match console output
self.default_formatter = logging.Formatter("%(asctime)s %(name)s:%(lineno)d %(category)s: %(message)s")
self.setFormatter(self.default_formatter)
def emit(self, record):
if hasattr(record, "msg"):
record.msg = strip_rich_markup(str(record.msg))
super().emit(record)
def setup_logging(category_levels: dict[str, int] | None = None, log_file: str | None = None) -> None:
"""
Configure logging based on the provided category log levels and an optional log file.
If category_levels or log_file are not provided, they will be read from environment variables.
Parameters:
category_levels (Dict[str, int] | None): A dictionary mapping categories to their log levels.
If None, reads from LLAMA_STACK_LOGGING environment variable and uses defaults.
log_file (str | None): Path to a log file to additionally pipe the logs into.
If None, reads from LLAMA_STACK_LOG_FILE environment variable.
"""
global _category_levels
# Read from environment variables if not explicitly provided
if category_levels is None:
category_levels = dict.fromkeys(CATEGORIES, DEFAULT_LOG_LEVEL)
env_config = os.environ.get("LLAMA_STACK_LOGGING", "")
if env_config:
category_levels.update(parse_environment_config(env_config))
# Update the module-level _category_levels so that already-created loggers pick up the new levels
_category_levels.update(category_levels)
if log_file is None:
log_file = os.environ.get("LLAMA_STACK_LOG_FILE")
log_format = "%(asctime)s %(name)s:%(lineno)d %(category)s: %(message)s"
class CategoryFilter(logging.Filter):
"""Ensure category is always present in log records."""
def filter(self, record):
if not hasattr(record, "category"):
record.category = UNCATEGORIZED # Default to 'uncategorized' if no category found
return True
# Determine the root logger's level (default to WARNING if not specified)
root_level = category_levels.get("root", logging.WARNING)
handlers = {
"console": {
"()": CustomRichHandler, # Use custom console handler
"formatter": "rich",
"rich_tracebacks": True,
"show_time": False,
"show_path": False,
"markup": True,
"filters": ["category_filter"],
}
}
# Add a file handler if log_file is set
if log_file:
handlers["file"] = {
"()": CustomFileHandler,
"filename": log_file,
"mode": "a",
"encoding": "utf-8",
}
logging_config = {
"version": 1,
"disable_existing_loggers": False,
"formatters": {
"rich": {
"()": logging.Formatter,
"format": log_format,
}
},
"handlers": handlers,
"filters": {
"category_filter": {
"()": CategoryFilter,
}
},
"loggers": {
**{
category: {
"handlers": list(handlers.keys()), # Apply all handlers
"level": category_levels.get(category, DEFAULT_LOG_LEVEL),
"propagate": False, # Disable propagation to root logger
}
for category in CATEGORIES
},
# Explicitly configure uvicorn loggers to preserve their INFO level
"uvicorn": {
"handlers": list(handlers.keys()),
"level": logging.INFO,
"propagate": False,
},
"uvicorn.error": {
"handlers": list(handlers.keys()),
"level": logging.INFO,
"propagate": False,
},
"uvicorn.access": {
"handlers": list(handlers.keys()),
"level": logging.INFO,
"propagate": False,
},
},
"root": {
"handlers": list(handlers.keys()),
"level": root_level, # Set root logger's level dynamically
},
}
dictConfig(logging_config)
# Update log levels for all loggers that were created before setup_logging was called
for name, logger in logging.root.manager.loggerDict.items():
if isinstance(logger, logging.Logger):
# Skip infrastructure loggers (uvicorn, fastapi) to preserve their configured levels
if name.startswith(("uvicorn", "fastapi")):
continue
# Update llama_stack loggers if root level was explicitly set (e.g., via all=CRITICAL)
if name.startswith("llama_stack") and "root" in category_levels:
logger.setLevel(root_level)
# Update third-party library loggers
elif not name.startswith("llama_stack"):
logger.setLevel(root_level)
def get_logger(
name: str, category: str = "uncategorized", config: LoggingConfig | None | None = None
) -> logging.LoggerAdapter:
"""
Returns a logger with the specified name and category.
If no category is provided, defaults to 'uncategorized'.
Parameters:
name (str): The name of the logger (e.g., module or filename).
category (str): The category of the logger (default 'uncategorized').
config (Logging): optional yaml config to override the existing logger configuration
Returns:
logging.LoggerAdapter: Configured logger with category support.
"""
if config:
_category_levels.update(parse_yaml_config(config))
logger = logging.getLogger(name)
if category in _category_levels:
log_level = _category_levels[category]
else:
root_category = category.split("::")[0]
if root_category in _category_levels:
log_level = _category_levels[root_category]
else:
if category != UNCATEGORIZED:
raise ValueError(
f"Unknown logging category: {category}. To resolve, choose a valid category from the CATEGORIES list "
f"or add it to the CATEGORIES list. Available categories: {CATEGORIES}"
)
log_level = _category_levels.get("root", DEFAULT_LOG_LEVEL)
logger.setLevel(log_level)
return logging.LoggerAdapter(logger, {"category": category})