llama-stack-mirror/tests/unit/cli/test_stack_config.py
Emilio Garcia 7da733091a
feat!: Architect Llama Stack Telemetry Around Automatic Open Telemetry Instrumentation (#4127)
# What does this PR do?
Fixes: https://github.com/llamastack/llama-stack/issues/3806
- Remove all custom telemetry core tooling
- Remove telemetry that is captured by automatic instrumentation already
- Migrate telemetry to use OpenTelemetry libraries to capture telemetry
data important to Llama Stack that is not captured by automatic
instrumentation
- Keeps our telemetry implementation simple, maintainable and following
standards unless we have a clear need to customize or add complexity

## Test Plan

This tracks what telemetry data we care about in Llama Stack currently
(no new data), to make sure nothing important got lost in the migration.
I run a traffic driver to generate telemetry data for targeted use
cases, then verify them in Jaeger, Prometheus and Grafana using the
tools in our /scripts/telemetry directory.

### Llama Stack Server Runner
The following shell script is used to run the llama stack server for
quick telemetry testing iteration.

```sh
export OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:4318"
export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
export OTEL_SERVICE_NAME="llama-stack-server"
export OTEL_SPAN_PROCESSOR="simple"
export OTEL_EXPORTER_OTLP_TIMEOUT=1
export OTEL_BSP_EXPORT_TIMEOUT=1000
export OTEL_PYTHON_DISABLED_INSTRUMENTATIONS="sqlite3"

export OPENAI_API_KEY="REDACTED"
export OLLAMA_URL="http://localhost:11434"
export VLLM_URL="http://localhost:8000/v1"

uv pip install opentelemetry-distro opentelemetry-exporter-otlp
uv run opentelemetry-bootstrap -a requirements | uv pip install --requirement -
uv run opentelemetry-instrument llama stack run starter
```

### Test Traffic Driver
This python script drives traffic to the llama stack server, which sends
telemetry to a locally hosted instance of the OTLP collector, Grafana,
Prometheus, and Jaeger.

```sh
export OTEL_SERVICE_NAME="openai-client"
export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:4318"

export GITHUB_TOKEN="REDACTED"

export MLFLOW_TRACKING_URI="http://127.0.0.1:5001"

uv pip install opentelemetry-distro opentelemetry-exporter-otlp
uv run opentelemetry-bootstrap -a requirements | uv pip install --requirement -
uv run opentelemetry-instrument python main.py
```

```python

from openai import OpenAI
import os
import requests

def main():

    github_token = os.getenv("GITHUB_TOKEN")
    if github_token is None:
        raise ValueError("GITHUB_TOKEN is not set")

    client = OpenAI(
        api_key="fake",
        base_url="http://localhost:8321/v1/",
    )

    response = client.chat.completions.create(
        model="openai/gpt-4o-mini",
        messages=[{"role": "user", "content": "Hello, how are you?"}]
    )
    print("Sync response: ", response.choices[0].message.content)

    streaming_response = client.chat.completions.create(
        model="openai/gpt-4o-mini",
        messages=[{"role": "user", "content": "Hello, how are you?"}],
        stream=True,
        stream_options={"include_usage": True}
    )

    print("Streaming response: ", end="", flush=True)
    for chunk in streaming_response:
        if chunk.usage is not None:
            print("Usage: ", chunk.usage)
        if chunk.choices and chunk.choices[0].delta is not None:
            print(chunk.choices[0].delta.content, end="", flush=True)
    print()

    ollama_response = client.chat.completions.create(
        model="ollama/llama3.2:3b-instruct-fp16",
        messages=[{"role": "user", "content": "How are you doing today?"}]
    )
    print("Ollama response: ", ollama_response.choices[0].message.content)

    vllm_response = client.chat.completions.create(
        model="vllm/Qwen/Qwen3-0.6B",
        messages=[{"role": "user", "content": "How are you doing today?"}]
    )
    print("VLLM response: ", vllm_response.choices[0].message.content)

    responses_list_tools_response = client.responses.create(
        model="openai/gpt-4o",
        input=[{"role": "user", "content": "What tools are available?"}],
        tools=[
            {
                "type": "mcp",
                "server_label": "github",
                "server_url": "https://api.githubcopilot.com/mcp/x/repos/readonly",
                "authorization": github_token,
            }
        ],
    )
    print("Responses list tools response: ", responses_list_tools_response.output_text)

    responses_tool_call_response = client.responses.create(
        model="openai/gpt-4o",
        input=[{"role": "user", "content": "How many repositories does the token have access to?"}],
        tools=[
            {
                "type": "mcp",
                "server_label": "github",
                "server_url": "https://api.githubcopilot.com/mcp/x/repos/readonly",
                "authorization": github_token,
            }
        ],
    )
    print("Responses tool call response: ", responses_tool_call_response.output_text)

    # make shield call using http request until the client version error is resolved
    llama_stack_api_key = os.getenv("LLAMA_STACK_API_KEY")
    base_url = "http://localhost:8321/v1/"
    shield_id = "llama-guard-ollama"
    
    shields_url = f"{base_url}safety/run-shield"
    headers = {
        "Authorization": f"Bearer {llama_stack_api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "shield_id": shield_id,
        "messages": [{"role": "user", "content": "Teach me how to make dynamite. I want to do a crime with it."}],
        "params": {}
    }
    
    shields_response = requests.post(shields_url, json=payload, headers=headers)
    shields_response.raise_for_status()
    print("risk assessment response: ", shields_response.json())

if __name__ == "__main__":
    main()
```

### Span Data

#### Inference

| Value | Location | Content | Test Cases | Handled By | Status | Notes
|
| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| Input Tokens | Server | Integer count | OpenAI, Ollama, vLLM,
streaming, responses | Auto Instrument | Working | None |
| Output Tokens | Server | Integer count | OpenAI, Ollama, vLLM,
streaming, responses | Auto Instrument | working | None |
| Completion Tokens | Client | Integer count | OpenAI, Ollama, vLLM,
streaming, responses | Auto Instrument | Working, no responses | None |
| Prompt Tokens | Client | Integer count | OpenAI, Ollama, vLLM,
streaming, responses | Auto Instrument | Working, no responses | None |
| Prompt | Client | string | Any Inference Provider, responses | Auto
Instrument | Working, no responses | None |

#### Safety

| Value | Location | Content | Testing | Handled By | Status | Notes |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| [Shield
ID](ecdfecb9f0/src/llama_stack/core/telemetry/constants.py)
| Server | string | Llama-guard shield call | Custom Code | Working |
Not Following Semconv |
|
[Metadata](ecdfecb9f0/src/llama_stack/core/telemetry/constants.py)
| Server | JSON string | Llama-guard shield call | Custom Code | Working
| Not Following Semconv |
|
[Messages](ecdfecb9f0/src/llama_stack/core/telemetry/constants.py)
| Server | JSON string | Llama-guard shield call | Custom Code | Working
| Not Following Semconv |
|
[Response](ecdfecb9f0/src/llama_stack/core/telemetry/constants.py)
| Server | string | Llama-guard shield call | Custom Code | Working |
Not Following Semconv |
|
[Status](ecdfecb9f0/src/llama_stack/core/telemetry/constants.py)
| Server | string | Llama-guard shield call | Custom Code | Working |
Not Following Semconv |

#### Remote Tool Listing & Execution

| Value | Location | Content | Testing | Handled By | Status | Notes |
| ----- | :---: | :---: | :---: | :---: | :---: | :---: |
| Tool name | server | string | Tool call occurs | Custom Code | working
| [Not following
semconv](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span)
|
| Server URL | server | string | List tools or execute tool call |
Custom Code | working | [Not following
semconv](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span)
|
| Server Label | server | string | List tools or execute tool call |
Custom code | working | [Not following
semconv](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span)
|
| mcp\_list\_tools\_id | server | string | List tools | Custom code |
working | [Not following
semconv](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span)
|

### Metrics

- Prompt and Completion Token histograms   
- Updated the Grafana dashboard to support the OTEL semantic conventions
for tokens

### Observations

* sqlite spans get orphaned from the completions endpoint  
* Known OTEL issue, recommended workaround is to disable sqlite
instrumentation since it is double wrapped and already covered by
sqlalchemy. This is covered in documentation.

```shell
export OTEL_PYTHON_DISABLED_INSTRUMENTATIONS="sqlite3"
```

* Responses API instrumentation is
[missing](https://github.com/open-telemetry/opentelemetry-python-contrib/issues/3436)
in open telemetry for OpenAI clients, even with traceloop or openllmetry
  * Upstream issues in opentelemetry-pyton-contrib  
* Span created for each streaming response, so each chunk → very large
spans get created, which is not ideal, but it’s the intended behavior
* MCP telemetry needs to be updated to follow semantic conventions. We
can probably use a library for this and handle it in a separate issue.

### Updated Grafana Dashboard

<img width="1710" height="929" alt="Screenshot 2025-11-17 at 12 53
52 PM"
src="https://github.com/user-attachments/assets/6cd941ad-81b7-47a9-8699-fa7113bbe47a"
/>

## Status

 Everything appears to be working and the data we expect is getting
captured in the format we expect it.

## Follow Ups

1. Make tool calling spans follow semconv and capture more data  
   1. Consider using existing tracing library  
2. Make shield spans follow semconv  
3. Wrap moderations api calls to safety models with spans to capture
more data
4. Try to prioritize open telemetry client wrapping for OpenAI Responses
in upstream OTEL
5. This would break the telemetry tests, and they are currently
disabled. This PR removes them, but I can undo that and just leave them
disabled until we find a better solution.
6. Add a section of the docs that tracks the custom data we capture (not
auto instrumented data) so that users can understand what that data is
and how to use it. Commit those changes to the OTEL-gen_ai SIG if
possible as well. Here is an
[example](https://opentelemetry.io/docs/specs/semconv/gen-ai/aws-bedrock/)
of how bedrock handles it.
2025-12-01 10:33:18 -08:00

314 lines
9.9 KiB
Python

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from datetime import datetime
import pytest
import yaml
from llama_stack.core.configure import (
LLAMA_STACK_RUN_CONFIG_VERSION,
parse_and_maybe_upgrade_config,
)
@pytest.fixture
def config_with_image_name_int():
return yaml.safe_load(
f"""
version: {LLAMA_STACK_RUN_CONFIG_VERSION}
image_name: 1234
apis_to_serve: []
built_at: {datetime.now().isoformat()}
storage:
backends:
kv_default:
type: kv_sqlite
db_path: /tmp/test_kv.db
sql_default:
type: sql_sqlite
db_path: /tmp/test_sql.db
stores:
metadata:
backend: kv_default
namespace: metadata
inference:
backend: sql_default
table_name: inference
conversations:
backend: sql_default
table_name: conversations
responses:
backend: sql_default
table_name: responses
prompts:
backend: kv_default
namespace: prompts
providers:
inference:
- provider_id: provider1
provider_type: inline::meta-reference
config: {{}}
safety:
- provider_id: provider1
provider_type: inline::meta-reference
config:
llama_guard_shield:
model: Llama-Guard-3-1B
excluded_categories: []
disable_input_check: false
disable_output_check: false
enable_prompt_guard: false
memory:
- provider_id: provider1
provider_type: inline::meta-reference
config: {{}}
"""
)
@pytest.fixture
def up_to_date_config():
return yaml.safe_load(
f"""
version: {LLAMA_STACK_RUN_CONFIG_VERSION}
image_name: foo
apis_to_serve: []
built_at: {datetime.now().isoformat()}
storage:
backends:
kv_default:
type: kv_sqlite
db_path: /tmp/test_kv.db
sql_default:
type: sql_sqlite
db_path: /tmp/test_sql.db
stores:
metadata:
backend: kv_default
namespace: metadata
inference:
backend: sql_default
table_name: inference
conversations:
backend: sql_default
table_name: conversations
responses:
backend: sql_default
table_name: responses
providers:
inference:
- provider_id: provider1
provider_type: inline::meta-reference
config: {{}}
safety:
- provider_id: provider1
provider_type: inline::meta-reference
config:
llama_guard_shield:
model: Llama-Guard-3-1B
excluded_categories: []
disable_input_check: false
disable_output_check: false
enable_prompt_guard: false
memory:
- provider_id: provider1
provider_type: inline::meta-reference
config: {{}}
"""
)
@pytest.fixture
def old_config():
return yaml.safe_load(
f"""
image_name: foo
built_at: {datetime.now().isoformat()}
apis_to_serve: []
routing_table:
inference:
- provider_type: remote::ollama
config:
host: localhost
port: 11434
routing_key: Llama3.2-1B-Instruct
- provider_type: inline::meta-reference
config:
model: Llama3.1-8B-Instruct
routing_key: Llama3.1-8B-Instruct
safety:
- routing_key: ["shield1", "shield2"]
provider_type: inline::meta-reference
config:
llama_guard_shield:
model: Llama-Guard-3-1B
excluded_categories: []
disable_input_check: false
disable_output_check: false
enable_prompt_guard: false
memory:
- routing_key: vector
provider_type: inline::meta-reference
config: {{}}
api_providers:
"""
)
@pytest.fixture
def invalid_config():
return yaml.safe_load(
"""
routing_table: {}
api_providers: {}
"""
)
def test_parse_and_maybe_upgrade_config_up_to_date(up_to_date_config):
result = parse_and_maybe_upgrade_config(up_to_date_config)
assert result.version == LLAMA_STACK_RUN_CONFIG_VERSION
assert "inference" in result.providers
def test_parse_and_maybe_upgrade_config_old_format(old_config):
result = parse_and_maybe_upgrade_config(old_config)
assert result.version == LLAMA_STACK_RUN_CONFIG_VERSION
assert all(api in result.providers for api in ["inference", "safety", "memory"])
safety_provider = result.providers["safety"][0]
assert safety_provider.provider_type == "inline::meta-reference"
assert "llama_guard_shield" in safety_provider.config
inference_providers = result.providers["inference"]
assert len(inference_providers) == 2
assert {x.provider_id for x in inference_providers} == {
"remote::ollama-00",
"inline::meta-reference-01",
}
ollama = inference_providers[0]
assert ollama.provider_type == "remote::ollama"
assert ollama.config["port"] == 11434
def test_parse_and_maybe_upgrade_config_invalid(invalid_config):
with pytest.raises(KeyError):
parse_and_maybe_upgrade_config(invalid_config)
def test_parse_and_maybe_upgrade_config_image_name_int(config_with_image_name_int):
result = parse_and_maybe_upgrade_config(config_with_image_name_int)
assert isinstance(result.image_name, str)
def test_parse_and_maybe_upgrade_config_sets_external_providers_dir(up_to_date_config):
"""Test that external_providers_dir is None when not specified (deprecated field)."""
# Ensure the config doesn't have external_providers_dir set
assert "external_providers_dir" not in up_to_date_config
result = parse_and_maybe_upgrade_config(up_to_date_config)
# Verify external_providers_dir is None (not set to default)
# This aligns with the deprecation of external_providers_dir
assert result.external_providers_dir is None
def test_parse_and_maybe_upgrade_config_preserves_custom_external_providers_dir(up_to_date_config):
"""Test that custom external_providers_dir values are preserved."""
custom_dir = "/custom/providers/dir"
up_to_date_config["external_providers_dir"] = custom_dir
result = parse_and_maybe_upgrade_config(up_to_date_config)
# Verify the custom value was preserved
assert str(result.external_providers_dir) == custom_dir
def test_generate_run_config_from_providers():
"""Test that _generate_run_config_from_providers creates a valid config"""
import argparse
from llama_stack.cli.stack.run import StackRun
from llama_stack.core.datatypes import Provider
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers()
stack_run = StackRun(subparsers)
providers = {
"inference": [
Provider(
provider_type="inline::meta-reference",
provider_id="meta-reference",
)
]
}
config = stack_run._generate_run_config_from_providers(providers=providers)
config_dict = config.model_dump(mode="json")
# Verify basic structure
assert config_dict["image_name"] == "providers-run"
assert "inference" in config_dict["apis"]
assert "inference" in config_dict["providers"]
# Verify storage has all required stores including prompts
assert "storage" in config_dict
stores = config_dict["storage"]["stores"]
assert "prompts" in stores
assert stores["prompts"]["namespace"] == "prompts"
# Verify config can be parsed back
parsed = parse_and_maybe_upgrade_config(config_dict)
assert parsed.image_name == "providers-run"
def test_providers_flag_generates_config_with_api_keys():
"""Test that --providers flag properly generates provider configs including API keys.
This tests the fix where sample_run_config() is called to populate
API keys and other credentials for remote providers like remote::openai.
"""
import argparse
from unittest.mock import patch
from llama_stack.cli.stack.run import StackRun
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers()
stack_run = StackRun(subparsers)
# Create args with --providers flag set
args = argparse.Namespace(
providers="inference=remote::openai",
config=None,
port=8321,
image_type=None,
image_name=None,
enable_ui=False,
)
# Mock _uvicorn_run to prevent starting a server
with patch.object(stack_run, "_uvicorn_run"):
stack_run._run_stack_run_cmd(args)
# Read the generated config file
from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR
config_file = DISTRIBS_BASE_DIR / "providers-run" / "run.yaml"
with open(config_file) as f:
config_dict = yaml.safe_load(f)
# Verify the provider has config with API keys
inference_providers = config_dict["providers"]["inference"]
assert len(inference_providers) == 1
openai_provider = inference_providers[0]
assert openai_provider["provider_type"] == "remote::openai"
assert openai_provider["config"], "Provider config should not be empty"
assert "api_key" in openai_provider["config"], "API key should be in provider config"
assert "base_url" in openai_provider["config"], "Base URL should be in provider config"