llama-stack-mirror/tests/integration/telemetry/test_openai_telemetry.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import time
from datetime import UTC, datetime

import pytest


@pytest.fixture(scope="module", autouse=True)
def setup_openai_telemetry_data(llama_stack_client, text_model_id):
    """Setup fixture that creates telemetry data specifically for OpenAI completions testing."""

    # Create OpenAI completion traces
    for i in range(3):
        llama_stack_client.chat.completions.create(
            model=text_model_id,
            messages=[
                {
                    "role": "user",
                    "content": f"Test trace openai {i}",
                }
            ],
            # stream=False to always capture Metrics.
            stream=False,
        )

    # Create additional OpenAI completion traces with different parameters
    for i in range(2):
        llama_stack_client.chat.completions.create(
            model=text_model_id,
            messages=[
                {
                    "role": "user",
                    "content": f"Test trace openai with temperature {i}",
                }
            ],
            temperature=0.7,
            max_tokens=100,
            stream=False,
        )

    start_time = time.time()

    while time.time() - start_time < 30:
        traces = llama_stack_client.telemetry.query_traces(limit=10)
        if len(traces) >= 5:  # 5 OpenAI completion traces
            break
        time.sleep(0.1)

    if len(traces) < 5:
        pytest.fail(
            f"Failed to create sufficient OpenAI completion telemetry data after 30s. Got {len(traces)} traces."
        )

    yield


def test_openai_traces_basic(llama_stack_client):
    """Test basic trace querying functionality for OpenAI completions."""
    all_traces = llama_stack_client.telemetry.query_traces(limit=10)

    assert isinstance(all_traces, list), "Should return a list of traces"
    assert len(all_traces) >= 5, "Should have at least 5 traces from OpenAI setup"

    # Verify trace structure and data quality
    first_trace = all_traces[0]
    assert hasattr(first_trace, "trace_id"), "Trace should have trace_id"
    assert hasattr(first_trace, "start_time"), "Trace should have start_time"
    assert hasattr(first_trace, "root_span_id"), "Trace should have root_span_id"

    # Validate trace_id is a valid UUID format
    assert isinstance(first_trace.trace_id, str) and len(first_trace.trace_id) > 0, (
        "trace_id should be non-empty string"
    )

    # Validate start_time format and not in the future
    now = datetime.now(UTC)
    if isinstance(first_trace.start_time, str):
        trace_time = datetime.fromisoformat(first_trace.start_time.replace("Z", "+00:00"))
    else:
        # start_time is already a datetime object
        trace_time = first_trace.start_time
        if trace_time.tzinfo is None:
            trace_time = trace_time.replace(tzinfo=UTC)

    # Ensure trace time is not in the future
    time_diff = (now - trace_time).total_seconds()
    assert time_diff >= 0, f"Trace start_time should not be in the future, got {time_diff}s"

    # Validate root_span_id exists and is non-empty
    assert isinstance(first_trace.root_span_id, str) and len(first_trace.root_span_id) > 0, (
        "root_span_id should be non-empty string"
    )

    # Test querying specific trace by ID
    specific_trace = llama_stack_client.telemetry.get_trace(trace_id=first_trace.trace_id)
    assert specific_trace.trace_id == first_trace.trace_id, "Retrieved trace should match requested ID"
    assert specific_trace.start_time == first_trace.start_time, "Retrieved trace should have same start_time"
    assert specific_trace.root_span_id == first_trace.root_span_id, "Retrieved trace should have same root_span_id"

    # Test pagination with proper validation
    recent_traces = llama_stack_client.telemetry.query_traces(limit=3, offset=0)
    assert len(recent_traces) <= 3, "Should return at most 3 traces when limit=3"
    assert len(recent_traces) >= 1, "Should return at least 1 trace"

    # Verify all traces have required fields
    for trace in recent_traces:
        assert hasattr(trace, "trace_id") and trace.trace_id, "Each trace should have non-empty trace_id"
        assert hasattr(trace, "start_time") and trace.start_time, "Each trace should have non-empty start_time"
        assert hasattr(trace, "root_span_id") and trace.root_span_id, "Each trace should have non-empty root_span_id"


def test_openai_spans_basic(llama_stack_client):
    """Test basic span querying functionality for OpenAI completions."""
    spans = llama_stack_client.telemetry.query_spans(attribute_filters=[], attributes_to_return=[])

    assert isinstance(spans, list), "Should return a list of spans"
    assert len(spans) >= 1, "Should have at least one span from OpenAI setup"

    # Verify span structure and data quality
    first_span = spans[0]
    required_attrs = ["span_id", "name", "trace_id"]
    for attr in required_attrs:
        assert hasattr(first_span, attr), f"Span should have {attr} attribute"
        assert getattr(first_span, attr), f"Span {attr} should not be empty"

    # Validate span data types and content
    assert isinstance(first_span.span_id, str) and len(first_span.span_id) > 0, "span_id should be non-empty string"
    assert isinstance(first_span.name, str) and len(first_span.name) > 0, "span name should be non-empty string"
    assert isinstance(first_span.trace_id, str) and len(first_span.trace_id) > 0, "trace_id should be non-empty string"

    # Verify span belongs to a valid trace
    all_traces = llama_stack_client.telemetry.query_traces(limit=10)
    trace_ids = {t.trace_id for t in all_traces}
    if first_span.trace_id in trace_ids:
        trace = llama_stack_client.telemetry.get_trace(trace_id=first_span.trace_id)
        assert trace is not None, "Should be able to retrieve trace for valid trace_id"
        assert trace.trace_id == first_span.trace_id, "Trace ID should match span's trace_id"

    # Test with span filtering and validate results
    filtered_spans = llama_stack_client.telemetry.query_spans(
        attribute_filters=[{"key": "name", "op": "eq", "value": first_span.name}],
        attributes_to_return=["name", "span_id"],
    )
    assert isinstance(filtered_spans, list), "Should return a list with span name filter"

    # Validate filtered spans if filtering works
    if len(filtered_spans) > 0:
        for span in filtered_spans:
            assert hasattr(span, "name"), "Filtered spans should have name attribute"
            assert hasattr(span, "span_id"), "Filtered spans should have span_id attribute"
            assert span.name == first_span.name, "Filtered spans should match the filter criteria"
            assert isinstance(span.span_id, str) and len(span.span_id) > 0, "Filtered span_id should be valid"

    # Test that all spans have consistent structure
    for span in spans:
        for attr in required_attrs:
            assert hasattr(span, attr) and getattr(span, attr), f"All spans should have non-empty {attr}"


def test_openai_completion_creates_telemetry(llama_stack_client, text_model_id):
    """Test that making OpenAI completion calls actually creates telemetry data."""

    # Get initial trace count
    initial_traces = llama_stack_client.telemetry.query_traces(limit=20)
    initial_count = len(initial_traces)

    # Make a new OpenAI completion call
    response = llama_stack_client.chat.completions.create(
        model=text_model_id,
        messages=[{"role": "user", "content": "Test OpenAI telemetry creation"}],
        stream=False,
    )

    # Verify we got a response
    assert response is not None, "Should get a response from OpenAI completion"
    assert hasattr(response, "choices"), "Response should have choices"
    assert len(response.choices) > 0, "Response should have at least one choice"

    # Wait for telemetry to be recorded
    start_time = time.time()
    while time.time() - start_time < 30:
        final_traces = llama_stack_client.telemetry.query_traces(limit=20)
        final_count = len(final_traces)
        if final_count > initial_count:
            break
        time.sleep(0.1)

    # Should have at least as many traces as before (might have more due to other activity)
    assert final_count >= initial_count, "Should have at least as many traces after OpenAI call"