llama-stack-mirror/tests/integration/telemetry/test_telemetry_metrics.py
Charlie Doern 3b9278f254
feat: implement query_metrics (#3074)
# What does this PR do?

query_metrics currently has no implementation, meaning once a metric is
emitted there is no way in llama stack to query it from the store.

Implement query_metrics for the meta_reference provider. It follows a
similar style to `query_traces`: it uses the trace_store to format an SQL
query and execute it.

in this case the parameters for the query are `metric.METRIC_NAME,
start_time, and end_time` and any other matchers if they are provided.

This required client-side changes, since the client had no
`query_metrics` or any associated resources, so any tests here will fail,
but I will provide manual execution logs for the new tests I am adding.

order the metrics by timestamp.

Additionally add `unit` to the `MetricDataPoint` class since this adds
much more context to the metric being queried.


depends on
https://github.com/llamastack/llama-stack-client-python/pull/260

## Test Plan

```
# Manual check: query the `total_tokens` metric through the HTTP client and print the result.
import time
import uuid


def create_http_client():
    """Return a LlamaStackClient configured with base_url http://localhost:8321."""
    # Imported inside the function, so llama_stack_client is only loaded when a client is built.
    from llama_stack_client import LlamaStackClient

    return LlamaStackClient(base_url="http://localhost:8321")


client = create_http_client()

# start_time=0 is the only time bound passed; no end_time or label matchers are supplied.
response = client.telemetry.query_metrics(metric_name="total_tokens", start_time=0)
print(response)
```

```
╰─ python3.12 ~/telemetry.py
INFO:httpx:HTTP Request: POST http://localhost:8322/v1/telemetry/metrics/total_tokens "HTTP/1.1 200 OK"
[TelemetryQueryMetricsResponse(data=None, metric='total_tokens', labels=[], values=[{'timestamp': 1753999514, 'value': 34.0, 'unit': 'tokens'}, {'timestamp': 1753999816, 'value': 34.0, 'unit': 'tokens'}, {'timestamp': 1753999881, 'value': 34.0, 'unit': 'tokens'}, {'timestamp': 1753999956, 'value': 34.0, 'unit': 'tokens'}, {'timestamp': 1754000200, 'value': 34.0, 'unit': 'tokens'}, {'timestamp': 1754000419, 'value': 36.0, 'unit': 'tokens'}, {'timestamp': 1754000714, 'value': 36.0, 'unit': 'tokens'}, {'timestamp': 1754000876, 'value': 36.0, 'unit': 'tokens'}, {'timestamp': 1754000908, 'value': 34.0, 'unit': 'tokens'}, {'timestamp': 1754001309, 'value': 584.0, 'unit': 'tokens'}, {'timestamp': 1754001311, 'value': 138.0, 'unit': 'tokens'}, {'timestamp': 1754001316, 'value': 349.0, 'unit': 'tokens'}, {'timestamp': 1754001318, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001320, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001341, 'value': 923.0, 'unit': 'tokens'}, {'timestamp': 1754001350, 'value': 354.0, 'unit': 'tokens'}, {'timestamp': 1754001462, 'value': 417.0, 'unit': 'tokens'}, {'timestamp': 1754001464, 'value': 158.0, 'unit': 'tokens'}, {'timestamp': 1754001475, 'value': 697.0, 'unit': 'tokens'}, {'timestamp': 1754001477, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001479, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001489, 'value': 298.0, 'unit': 'tokens'}, {'timestamp': 1754001541, 'value': 615.0, 'unit': 'tokens'}, {'timestamp': 1754001543, 'value': 119.0, 'unit': 'tokens'}, {'timestamp': 1754001548, 'value': 310.0, 'unit': 'tokens'}, {'timestamp': 1754001549, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001551, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001568, 'value': 714.0, 'unit': 'tokens'}, {'timestamp': 1754001800, 'value': 437.0, 'unit': 'tokens'}, {'timestamp': 1754001802, 'value': 200.0, 'unit': 'tokens'}, {'timestamp': 1754001806, 'value': 262.0, 'unit': 'tokens'}, {'timestamp': 1754001808, 
'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001810, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001816, 'value': 82.0, 'unit': 'tokens'}, {'timestamp': 1754001923, 'value': 61.0, 'unit': 'tokens'}, {'timestamp': 1754001929, 'value': 391.0, 'unit': 'tokens'}, {'timestamp': 1754001939, 'value': 598.0, 'unit': 'tokens'}, {'timestamp': 1754001941, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001942, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001952, 'value': 252.0, 'unit': 'tokens'}, {'timestamp': 1754002053, 'value': 251.0, 'unit': 'tokens'}, {'timestamp': 1754002059, 'value': 375.0, 'unit': 'tokens'}, {'timestamp': 1754002062, 'value': 244.0, 'unit': 'tokens'}, {'timestamp': 1754002064, 'value': 111.0, 'unit': 'tokens'}, {'timestamp': 1754002065, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754002083, 'value': 719.0, 'unit': 'tokens'}, {'timestamp': 1754002302, 'value': 279.0, 'unit': 'tokens'}, {'timestamp': 1754002306, 'value': 218.0, 'unit': 'tokens'}, {'timestamp': 1754002308, 'value': 198.0, 'unit': 'tokens'}, {'timestamp': 1754002309, 'value': 69.0, 'unit': 'tokens'}, {'timestamp': 1754002311, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754002324, 'value': 481.0, 'unit': 'tokens'}, {'timestamp': 1754003161, 'value': 579.0, 'unit': 'tokens'}, {'timestamp': 1754003161, 'value': 69.0, 'unit': 'tokens'}, {'timestamp': 1754003169, 'value': 499.0, 'unit': 'tokens'}, {'timestamp': 1754003171, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754003173, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754003185, 'value': 422.0, 'unit': 'tokens'}, {'timestamp': 1754003448, 'value': 579.0, 'unit': 'tokens'}, {'timestamp': 1754003453, 'value': 422.0, 'unit': 'tokens'}, {'timestamp': 1754003589, 'value': 579.0, 'unit': 'tokens'}, {'timestamp': 1754003609, 'value': 279.0, 'unit': 'tokens'}, {'timestamp': 1754003614, 'value': 481.0, 'unit': 'tokens'}, {'timestamp': 1754003706, 'value': 303.0, 'unit': 'tokens'}, {'timestamp': 
1754003706, 'value': 51.0, 'unit': 'tokens'}, {'timestamp': 1754003713, 'value': 426.0, 'unit': 'tokens'}, {'timestamp': 1754003714, 'value': 70.0, 'unit': 'tokens'}, {'timestamp': 1754003715, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754003724, 'value': 225.0, 'unit': 'tokens'}, {'timestamp': 1754004226, 'value': 516.0, 'unit': 'tokens'}, {'timestamp': 1754004228, 'value': 127.0, 'unit': 'tokens'}, {'timestamp': 1754004232, 'value': 281.0, 'unit': 'tokens'}, {'timestamp': 1754004234, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754004236, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754004244, 'value': 206.0, 'unit': 'tokens'}, {'timestamp': 1754004683, 'value': 338.0, 'unit': 'tokens'}, {'timestamp': 1754004690, 'value': 481.0, 'unit': 'tokens'}, {'timestamp': 1754004692, 'value': 124.0, 'unit': 'tokens'}, {'timestamp': 1754004692, 'value': 65.0, 'unit': 'tokens'}, {'timestamp': 1754004694, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754004703, 'value': 211.0, 'unit': 'tokens'}, {'timestamp': 1754004743, 'value': 338.0, 'unit': 'tokens'}, {'timestamp': 1754004749, 'value': 211.0, 'unit': 'tokens'}, {'timestamp': 1754005566, 'value': 481.0, 'unit': 'tokens'}, {'timestamp': 1754006101, 'value': 159.0, 'unit': 'tokens'}, {'timestamp': 1754006105, 'value': 272.0, 'unit': 'tokens'}, {'timestamp': 1754006109, 'value': 308.0, 'unit': 'tokens'}, {'timestamp': 1754006110, 'value': 61.0, 'unit': 'tokens'}, {'timestamp': 1754006112, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754006130, 'value': 705.0, 'unit': 'tokens'}, {'timestamp': 1754051825, 'value': 454.0, 'unit': 'tokens'}, {'timestamp': 1754051827, 'value': 152.0, 'unit': 'tokens'}, {'timestamp': 1754051834, 'value': 481.0, 'unit': 'tokens'}, {'timestamp': 1754051835, 'value': 55.0, 'unit': 'tokens'}, {'timestamp': 1754051837, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754051845, 'value': 102.0, 'unit': 'tokens'}, {'timestamp': 1754099929, 'value': 36.0, 'unit': 'tokens'}, 
{'timestamp': 1754510050, 'value': 598.0, 'unit': 'tokens'}, {'timestamp': 1754510052, 'value': 160.0, 'unit': 'tokens'}, {'timestamp': 1754510064, 'value': 725.0, 'unit': 'tokens'}, {'timestamp': 1754510065, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754510067, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754510083, 'value': 535.0, 'unit': 'tokens'}, {'timestamp': 1754596582, 'value': 36.0, 'unit': 'tokens'}])]
```

Adds tests for each currently documented metric in llama stack using
this new function. Some manual testing output is also attached.


Integration tests pass locally in replay mode with the linked
client changes:
<img width="1907" height="529" alt="Screenshot 2025-08-08 at 2 49 14 PM"
src="https://github.com/user-attachments/assets/d482ab06-dcff-4f0c-a1f1-f870670ee9bc"
/>

---------

Signed-off-by: Charlie Doern <cdoern@redhat.com>
2025-08-22 14:19:24 -07:00

209 lines
8 KiB
Python

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import time
from datetime import UTC, datetime, timedelta
import pytest
@pytest.fixture(scope="module", autouse=True)
def setup_telemetry_metrics_data(openai_client, client_with_models, text_model_id):
    """Generate token-usage metrics once per module and return the observed counts.

    Issues five non-streaming chat completions through ``openai_client`` and records
    the ``usage`` token counts of each response. It then polls
    ``client_with_models.telemetry.query_metrics`` for ``completion_tokens`` until at
    least one data point is returned or 30 seconds have elapsed, and finally sleeps
    five more seconds before handing control to the tests.

    The whole module is skipped when ``client_with_models`` has no ``base_url``
    attribute.

    Returns:
        A dict with the keys ``prompt_tokens``, ``completion_tokens`` and
        ``total_tokens``, each mapping to the list of counts reported by the
        completions created here (in creation order).
    """
    # Skip OpenAI tests if running in library mode
    if not hasattr(client_with_models, "base_url"):
        pytest.skip("OpenAI client tests not supported with library client")

    prompt_tokens = []
    completion_tokens = []
    total_tokens = []

    # Create OpenAI completions to generate metrics using the proper OpenAI client
    for i in range(5):
        response = openai_client.chat.completions.create(
            model=text_model_id,
            messages=[{"role": "user", "content": f"OpenAI test {i}"}],
            stream=False,
        )
        prompt_tokens.append(response.usage.prompt_tokens)
        completion_tokens.append(response.usage.completion_tokens)
        total_tokens.append(response.usage.total_tokens)

    # Wait for metrics to be logged. The timeout is measured on the monotonic clock
    # so that a wall-clock adjustment cannot shorten or extend the polling window.
    poll_started = time.monotonic()
    while time.monotonic() - poll_started < 30:
        try:
            # Try to query metrics to see if they're available
            metrics_response = client_with_models.telemetry.query_metrics(
                metric_name="completion_tokens",
                start_time=int((datetime.now(UTC) - timedelta(minutes=5)).timestamp()),
            )
            if len(metrics_response[0].values) > 0:
                break
        except Exception:
            # Deliberate best-effort: a failed query or an empty response (which makes
            # the [0] lookup raise) just means "not ready yet", so retry until timeout.
            pass
        time.sleep(1)

    # Wait additional time to ensure all metrics are processed
    time.sleep(5)

    # Return the token lists for use in tests
    return {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, "total_tokens": total_tokens}
@pytest.mark.skip(reason="Skipping this test until client is regenerated")
def test_query_metrics_prompt_tokens(client_with_models, text_model_id, setup_telemetry_metrics_data):
    """Check that the ``prompt_tokens`` metric can be queried.

    The last data point of the first returned series must carry one of the
    prompt-token counts recorded by the setup fixture.
    """
    window_start = datetime.now(UTC) - timedelta(minutes=10)
    response = client_with_models.telemetry.query_metrics(
        metric_name="prompt_tokens",
        start_time=int(window_start.timestamp()),
    )

    assert isinstance(response, list)
    series = response[0]
    assert isinstance(series.values, list), "Should return a list of metric series"
    assert series.metric == "prompt_tokens"

    # Compare against the counts captured during setup rather than fixed numbers.
    recorded_counts = setup_telemetry_metrics_data["prompt_tokens"]
    latest_value = series.values[-1].value
    assert latest_value in recorded_counts, f"Expected one of {recorded_counts}, got {latest_value}"
@pytest.mark.skip(reason="Skipping this test until client is regenerated")
def test_query_metrics_completion_tokens(client_with_models, text_model_id, setup_telemetry_metrics_data):
    """Check that the ``completion_tokens`` metric can be queried.

    The last data point of the first returned series must carry one of the
    completion-token counts recorded by the setup fixture.
    """
    window_start = datetime.now(UTC) - timedelta(minutes=10)
    response = client_with_models.telemetry.query_metrics(
        metric_name="completion_tokens",
        start_time=int(window_start.timestamp()),
    )

    assert isinstance(response, list)
    series = response[0]
    assert isinstance(series.values, list), "Should return a list of metric series"
    assert series.metric == "completion_tokens"

    # Compare against the counts captured during setup rather than fixed numbers.
    recorded_counts = setup_telemetry_metrics_data["completion_tokens"]
    latest_value = series.values[-1].value
    assert latest_value in recorded_counts, f"Expected one of {recorded_counts}, got {latest_value}"
@pytest.mark.skip(reason="Skipping this test until client is regenerated")
def test_query_metrics_total_tokens(client_with_models, text_model_id, setup_telemetry_metrics_data):
    """Check that the ``total_tokens`` metric can be queried.

    The last data point of the first returned series must carry one of the
    total-token counts recorded by the setup fixture.
    """
    window_start = datetime.now(UTC) - timedelta(minutes=10)
    response = client_with_models.telemetry.query_metrics(
        metric_name="total_tokens",
        start_time=int(window_start.timestamp()),
    )

    assert isinstance(response, list)
    series = response[0]
    assert isinstance(series.values, list), "Should return a list of metric series"
    assert series.metric == "total_tokens"

    # Compare against the counts captured during setup rather than fixed numbers.
    recorded_counts = setup_telemetry_metrics_data["total_tokens"]
    latest_value = series.values[-1].value
    assert latest_value in recorded_counts, f"Expected one of {recorded_counts}, got {latest_value}"
@pytest.mark.skip(reason="Skipping this test until client is regenerated")
def test_query_metrics_with_time_range(llama_stack_client, text_model_id):
    """Query ``prompt_tokens`` with both a start and an end bound."""
    window_seconds = 600  # 10 minutes
    now_ts = int(datetime.now(UTC).timestamp())

    response = llama_stack_client.telemetry.query_metrics(
        metric_name="prompt_tokens",
        start_time=now_ts - window_seconds,
        end_time=now_ts,
    )

    assert isinstance(response, list)
    series = response[0]
    assert isinstance(series.values, list), "Should return a list of metric series"
    assert series.metric == "prompt_tokens"
@pytest.mark.skip(reason="Skipping this test until client is regenerated")
def test_query_metrics_with_label_matchers(llama_stack_client, text_model_id):
    """Test that metrics are queryable with label matchers.

    Queries ``prompt_tokens`` for the last ten minutes, restricted by a single
    ``model_id = text_model_id`` matcher, and checks the shape of the response.
    """
    start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())
    response = llama_stack_client.telemetry.query_metrics(
        metric_name="prompt_tokens",
        start_time=start_time,
        label_matchers=[{"name": "model_id", "value": text_model_id, "operator": "="}],
    )

    # Same response-type check the other query tests in this module perform.
    assert isinstance(response, list)
    # Guard the [0] lookup so an empty result fails with a readable message
    # instead of a bare IndexError.
    assert response, "Label-matched query returned no metric series"
    assert isinstance(response[0].values, list), "Should return a list of metric series"
@pytest.mark.skip(reason="Skipping this test until client is regenerated")
def test_query_metrics_nonexistent_metric(llama_stack_client):
    """An unknown metric name must yield an empty list of series."""
    ten_minutes_ago = datetime.now(UTC) - timedelta(minutes=10)

    response = llama_stack_client.telemetry.query_metrics(
        metric_name="nonexistent_metric",
        start_time=int(ten_minutes_ago.timestamp()),
    )

    assert isinstance(response, list), "Should return an empty list for nonexistent metric"
    series_count = len(response)
    assert series_count == 0
@pytest.mark.skip(reason="Skipping this test until client is regenerated")
def test_query_metrics_with_granularity(llama_stack_client, text_model_id):
    """Test that metrics are queryable with different granularity levels.

    Queries ``total_tokens`` for the last ten minutes three times: with hourly
    (``"1h"``) granularity, daily (``"1d"``) granularity, and ``granularity=None``
    (raw data points). Each response must contain at least one series whose
    ``values`` attribute is a list.
    """
    start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())

    # (label used in assertion messages, granularity argument); None means raw points.
    granularities = (("Hourly", "1h"), ("Daily", "1d"), ("No", None))

    # Issue every query before asserting, as the checks only inspect the responses.
    responses = {
        label: llama_stack_client.telemetry.query_metrics(
            metric_name="total_tokens",
            start_time=start_time,
            granularity=granularity,
        )
        for label, granularity in granularities
    }

    for label, response in responses.items():
        # Guard the [0] lookup so an empty result fails with a readable message.
        # This replaces the former `len(...) >= 0` checks, which could never fail.
        assert response, f"{label} granularity query returned no metric series"
        assert isinstance(response[0].values, list), f"{label} granularity should return data"