mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-04 12:07:34 +00:00
# What does this PR do? query_metrics currently has no implementation, meaning once a metric is emitted there is no way in llama stack to query it from the store. implement query_metrics for the meta_reference provider which follows a similar style to `query_traces`, using the trace_store to format an SQL query and execute it in this case the parameters for the query are `metric.METRIC_NAME, start_time, and end_time` and any other matchers if they are provided. this required client side changes since the client had no `query_metrics` or any associated resources, so any tests here will fail but I will provide manual execution logs for the new tests I am adding order the metrics by timestamp. Additionally add `unit` to the `MetricDataPoint` class since this adds much more context to the metric being queried. depends on https://github.com/llamastack/llama-stack-client-python/pull/260 ## Test Plan ``` import time import uuid def create_http_client(): from llama_stack_client import LlamaStackClient return LlamaStackClient(base_url="http://localhost:8321") client = create_http_client() response = client.telemetry.query_metrics(metric_name="total_tokens", start_time=0) print(response) ``` ``` ╰─ python3.12 ~/telemetry.py INFO:httpx:HTTP Request: POST http://localhost:8322/v1/telemetry/metrics/total_tokens "HTTP/1.1 200 OK" [TelemetryQueryMetricsResponse(data=None, metric='total_tokens', labels=[], values=[{'timestamp': 1753999514, 'value': 34.0, 'unit': 'tokens'}, {'timestamp': 1753999816, 'value': 34.0, 'unit': 'tokens'}, {'timestamp': 1753999881, 'value': 34.0, 'unit': 'tokens'}, {'timestamp': 1753999956, 'value': 34.0, 'unit': 'tokens'}, {'timestamp': 1754000200, 'value': 34.0, 'unit': 'tokens'}, {'timestamp': 1754000419, 'value': 36.0, 'unit': 'tokens'}, {'timestamp': 1754000714, 'value': 36.0, 'unit': 'tokens'}, {'timestamp': 1754000876, 'value': 36.0, 'unit': 'tokens'}, {'timestamp': 1754000908, 'value': 34.0, 'unit': 'tokens'}, {'timestamp': 1754001309, 'value': 
584.0, 'unit': 'tokens'}, {'timestamp': 1754001311, 'value': 138.0, 'unit': 'tokens'}, {'timestamp': 1754001316, 'value': 349.0, 'unit': 'tokens'}, {'timestamp': 1754001318, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001320, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001341, 'value': 923.0, 'unit': 'tokens'}, {'timestamp': 1754001350, 'value': 354.0, 'unit': 'tokens'}, {'timestamp': 1754001462, 'value': 417.0, 'unit': 'tokens'}, {'timestamp': 1754001464, 'value': 158.0, 'unit': 'tokens'}, {'timestamp': 1754001475, 'value': 697.0, 'unit': 'tokens'}, {'timestamp': 1754001477, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001479, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001489, 'value': 298.0, 'unit': 'tokens'}, {'timestamp': 1754001541, 'value': 615.0, 'unit': 'tokens'}, {'timestamp': 1754001543, 'value': 119.0, 'unit': 'tokens'}, {'timestamp': 1754001548, 'value': 310.0, 'unit': 'tokens'}, {'timestamp': 1754001549, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001551, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001568, 'value': 714.0, 'unit': 'tokens'}, {'timestamp': 1754001800, 'value': 437.0, 'unit': 'tokens'}, {'timestamp': 1754001802, 'value': 200.0, 'unit': 'tokens'}, {'timestamp': 1754001806, 'value': 262.0, 'unit': 'tokens'}, {'timestamp': 1754001808, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001810, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001816, 'value': 82.0, 'unit': 'tokens'}, {'timestamp': 1754001923, 'value': 61.0, 'unit': 'tokens'}, {'timestamp': 1754001929, 'value': 391.0, 'unit': 'tokens'}, {'timestamp': 1754001939, 'value': 598.0, 'unit': 'tokens'}, {'timestamp': 1754001941, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001942, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754001952, 'value': 252.0, 'unit': 'tokens'}, {'timestamp': 1754002053, 'value': 251.0, 'unit': 'tokens'}, {'timestamp': 1754002059, 'value': 375.0, 'unit': 'tokens'}, {'timestamp': 
1754002062, 'value': 244.0, 'unit': 'tokens'}, {'timestamp': 1754002064, 'value': 111.0, 'unit': 'tokens'}, {'timestamp': 1754002065, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754002083, 'value': 719.0, 'unit': 'tokens'}, {'timestamp': 1754002302, 'value': 279.0, 'unit': 'tokens'}, {'timestamp': 1754002306, 'value': 218.0, 'unit': 'tokens'}, {'timestamp': 1754002308, 'value': 198.0, 'unit': 'tokens'}, {'timestamp': 1754002309, 'value': 69.0, 'unit': 'tokens'}, {'timestamp': 1754002311, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754002324, 'value': 481.0, 'unit': 'tokens'}, {'timestamp': 1754003161, 'value': 579.0, 'unit': 'tokens'}, {'timestamp': 1754003161, 'value': 69.0, 'unit': 'tokens'}, {'timestamp': 1754003169, 'value': 499.0, 'unit': 'tokens'}, {'timestamp': 1754003171, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754003173, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754003185, 'value': 422.0, 'unit': 'tokens'}, {'timestamp': 1754003448, 'value': 579.0, 'unit': 'tokens'}, {'timestamp': 1754003453, 'value': 422.0, 'unit': 'tokens'}, {'timestamp': 1754003589, 'value': 579.0, 'unit': 'tokens'}, {'timestamp': 1754003609, 'value': 279.0, 'unit': 'tokens'}, {'timestamp': 1754003614, 'value': 481.0, 'unit': 'tokens'}, {'timestamp': 1754003706, 'value': 303.0, 'unit': 'tokens'}, {'timestamp': 1754003706, 'value': 51.0, 'unit': 'tokens'}, {'timestamp': 1754003713, 'value': 426.0, 'unit': 'tokens'}, {'timestamp': 1754003714, 'value': 70.0, 'unit': 'tokens'}, {'timestamp': 1754003715, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754003724, 'value': 225.0, 'unit': 'tokens'}, {'timestamp': 1754004226, 'value': 516.0, 'unit': 'tokens'}, {'timestamp': 1754004228, 'value': 127.0, 'unit': 'tokens'}, {'timestamp': 1754004232, 'value': 281.0, 'unit': 'tokens'}, {'timestamp': 1754004234, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754004236, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754004244, 'value': 206.0, 'unit': 'tokens'}, 
{'timestamp': 1754004683, 'value': 338.0, 'unit': 'tokens'}, {'timestamp': 1754004690, 'value': 481.0, 'unit': 'tokens'}, {'timestamp': 1754004692, 'value': 124.0, 'unit': 'tokens'}, {'timestamp': 1754004692, 'value': 65.0, 'unit': 'tokens'}, {'timestamp': 1754004694, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754004703, 'value': 211.0, 'unit': 'tokens'}, {'timestamp': 1754004743, 'value': 338.0, 'unit': 'tokens'}, {'timestamp': 1754004749, 'value': 211.0, 'unit': 'tokens'}, {'timestamp': 1754005566, 'value': 481.0, 'unit': 'tokens'}, {'timestamp': 1754006101, 'value': 159.0, 'unit': 'tokens'}, {'timestamp': 1754006105, 'value': 272.0, 'unit': 'tokens'}, {'timestamp': 1754006109, 'value': 308.0, 'unit': 'tokens'}, {'timestamp': 1754006110, 'value': 61.0, 'unit': 'tokens'}, {'timestamp': 1754006112, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754006130, 'value': 705.0, 'unit': 'tokens'}, {'timestamp': 1754051825, 'value': 454.0, 'unit': 'tokens'}, {'timestamp': 1754051827, 'value': 152.0, 'unit': 'tokens'}, {'timestamp': 1754051834, 'value': 481.0, 'unit': 'tokens'}, {'timestamp': 1754051835, 'value': 55.0, 'unit': 'tokens'}, {'timestamp': 1754051837, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754051845, 'value': 102.0, 'unit': 'tokens'}, {'timestamp': 1754099929, 'value': 36.0, 'unit': 'tokens'}, {'timestamp': 1754510050, 'value': 598.0, 'unit': 'tokens'}, {'timestamp': 1754510052, 'value': 160.0, 'unit': 'tokens'}, {'timestamp': 1754510064, 'value': 725.0, 'unit': 'tokens'}, {'timestamp': 1754510065, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754510067, 'value': 133.0, 'unit': 'tokens'}, {'timestamp': 1754510083, 'value': 535.0, 'unit': 'tokens'}, {'timestamp': 1754596582, 'value': 36.0, 'unit': 'tokens'}])] ``` adding tests for each currently documented metric in llama stack using this new function. 
attached is also some manual testing integrations tests passing locally with replay mode and the linked client changes: <img width="1907" height="529" alt="Screenshot 2025-08-08 at 2 49 14 PM" src="https://github.com/user-attachments/assets/d482ab06-dcff-4f0c-a1f1-f870670ee9bc" /> --------- Signed-off-by: Charlie Doern <cdoern@redhat.com>
209 lines
8 KiB
Python
209 lines
8 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
# All rights reserved.
|
|
#
|
|
# This source code is licensed under the terms described in the LICENSE file in
|
|
# the root directory of this source tree.
|
|
|
|
import time
|
|
from datetime import UTC, datetime, timedelta
|
|
|
|
import pytest
|
|
|
|
|
|
@pytest.fixture(scope="module", autouse=True)
def setup_telemetry_metrics_data(openai_client, client_with_models, text_model_id):
    """Create telemetry metrics data (token counts) before the module's tests run.

    Returns a dict mapping metric name -> list of per-request token counts, so
    tests can compare queried metric values against what was actually emitted.
    """
    # Library-mode clients expose no HTTP endpoint, so the OpenAI client cannot
    # reach them — skip the whole module's metric tests in that configuration.
    if not hasattr(client_with_models, "base_url"):
        pytest.skip("OpenAI client tests not supported with library client")

    token_counts = {"prompt_tokens": [], "completion_tokens": [], "total_tokens": []}

    # Issue a handful of chat completions so token metrics get emitted.
    for request_idx in range(5):
        completion = openai_client.chat.completions.create(
            model=text_model_id,
            messages=[{"role": "user", "content": f"OpenAI test {request_idx}"}],
            stream=False,
        )
        usage = completion.usage
        token_counts["prompt_tokens"].append(usage.prompt_tokens)
        token_counts["completion_tokens"].append(usage.completion_tokens)
        token_counts["total_tokens"].append(usage.total_tokens)

    # Poll (for up to 30s) until at least one metric data point is queryable;
    # errors here just mean the telemetry store isn't ready yet, so retry.
    deadline = time.time() + 30
    while time.time() < deadline:
        try:
            probe = client_with_models.telemetry.query_metrics(
                metric_name="completion_tokens",
                start_time=int((datetime.now(UTC) - timedelta(minutes=5)).timestamp()),
            )
            if len(probe[0].values) > 0:
                break
        except Exception:
            pass
        time.sleep(1)

    # Give the telemetry pipeline a little extra time to flush the rest.
    time.sleep(5)

    return token_counts
|
|
|
|
|
|
@pytest.mark.skip(reason="Skipping this test until client is regenerated")
|
|
def test_query_metrics_prompt_tokens(client_with_models, text_model_id, setup_telemetry_metrics_data):
|
|
"""Test that prompt_tokens metrics are queryable."""
|
|
start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())
|
|
|
|
response = client_with_models.telemetry.query_metrics(
|
|
metric_name="prompt_tokens",
|
|
start_time=start_time,
|
|
)
|
|
|
|
assert isinstance(response, list)
|
|
|
|
assert isinstance(response[0].values, list), "Should return a list of metric series"
|
|
|
|
assert response[0].metric == "prompt_tokens"
|
|
|
|
# Use the actual values from setup instead of hardcoded values
|
|
expected_values = setup_telemetry_metrics_data["prompt_tokens"]
|
|
assert response[0].values[-1].value in expected_values, (
|
|
f"Expected one of {expected_values}, got {response[0].values[-1].value}"
|
|
)
|
|
|
|
|
|
@pytest.mark.skip(reason="Skipping this test until client is regenerated")
|
|
def test_query_metrics_completion_tokens(client_with_models, text_model_id, setup_telemetry_metrics_data):
|
|
"""Test that completion_tokens metrics are queryable."""
|
|
start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())
|
|
|
|
response = client_with_models.telemetry.query_metrics(
|
|
metric_name="completion_tokens",
|
|
start_time=start_time,
|
|
)
|
|
|
|
assert isinstance(response, list)
|
|
|
|
assert isinstance(response[0].values, list), "Should return a list of metric series"
|
|
|
|
assert response[0].metric == "completion_tokens"
|
|
|
|
# Use the actual values from setup instead of hardcoded values
|
|
expected_values = setup_telemetry_metrics_data["completion_tokens"]
|
|
assert response[0].values[-1].value in expected_values, (
|
|
f"Expected one of {expected_values}, got {response[0].values[-1].value}"
|
|
)
|
|
|
|
|
|
@pytest.mark.skip(reason="Skipping this test until client is regenerated")
|
|
def test_query_metrics_total_tokens(client_with_models, text_model_id, setup_telemetry_metrics_data):
|
|
"""Test that total_tokens metrics are queryable."""
|
|
start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())
|
|
|
|
response = client_with_models.telemetry.query_metrics(
|
|
metric_name="total_tokens",
|
|
start_time=start_time,
|
|
)
|
|
|
|
assert isinstance(response, list)
|
|
|
|
assert isinstance(response[0].values, list), "Should return a list of metric series"
|
|
|
|
assert response[0].metric == "total_tokens"
|
|
|
|
# Use the actual values from setup instead of hardcoded values
|
|
expected_values = setup_telemetry_metrics_data["total_tokens"]
|
|
assert response[0].values[-1].value in expected_values, (
|
|
f"Expected one of {expected_values}, got {response[0].values[-1].value}"
|
|
)
|
|
|
|
|
|
@pytest.mark.skip(reason="Skipping this test until client is regenerated")
|
|
def test_query_metrics_with_time_range(llama_stack_client, text_model_id):
|
|
"""Test that metrics are queryable with time range."""
|
|
end_time = int(datetime.now(UTC).timestamp())
|
|
start_time = end_time - 600 # 10 minutes ago
|
|
|
|
response = llama_stack_client.telemetry.query_metrics(
|
|
metric_name="prompt_tokens",
|
|
start_time=start_time,
|
|
end_time=end_time,
|
|
)
|
|
|
|
assert isinstance(response, list)
|
|
|
|
assert isinstance(response[0].values, list), "Should return a list of metric series"
|
|
|
|
assert response[0].metric == "prompt_tokens"
|
|
|
|
|
|
@pytest.mark.skip(reason="Skipping this test until client is regenerated")
|
|
def test_query_metrics_with_label_matchers(llama_stack_client, text_model_id):
|
|
"""Test that metrics are queryable with label matchers."""
|
|
start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())
|
|
|
|
response = llama_stack_client.telemetry.query_metrics(
|
|
metric_name="prompt_tokens",
|
|
start_time=start_time,
|
|
label_matchers=[{"name": "model_id", "value": text_model_id, "operator": "="}],
|
|
)
|
|
|
|
assert isinstance(response[0].values, list), "Should return a list of metric series"
|
|
|
|
|
|
@pytest.mark.skip(reason="Skipping this test until client is regenerated")
|
|
def test_query_metrics_nonexistent_metric(llama_stack_client):
|
|
"""Test that querying a nonexistent metric returns empty data."""
|
|
start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())
|
|
|
|
response = llama_stack_client.telemetry.query_metrics(
|
|
metric_name="nonexistent_metric",
|
|
start_time=start_time,
|
|
)
|
|
|
|
assert isinstance(response, list), "Should return an empty list for nonexistent metric"
|
|
assert len(response) == 0
|
|
|
|
|
|
@pytest.mark.skip(reason="Skipping this test until client is regenerated")
|
|
def test_query_metrics_with_granularity(llama_stack_client, text_model_id):
|
|
"""Test that metrics are queryable with different granularity levels."""
|
|
start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())
|
|
|
|
# Test hourly granularity
|
|
hourly_response = llama_stack_client.telemetry.query_metrics(
|
|
metric_name="total_tokens",
|
|
start_time=start_time,
|
|
granularity="1h",
|
|
)
|
|
|
|
# Test daily granularity
|
|
daily_response = llama_stack_client.telemetry.query_metrics(
|
|
metric_name="total_tokens",
|
|
start_time=start_time,
|
|
granularity="1d",
|
|
)
|
|
|
|
# Test no granularity (raw data points)
|
|
raw_response = llama_stack_client.telemetry.query_metrics(
|
|
metric_name="total_tokens",
|
|
start_time=start_time,
|
|
granularity=None,
|
|
)
|
|
|
|
# All should return valid data
|
|
assert isinstance(hourly_response[0].values, list), "Hourly granularity should return data"
|
|
assert isinstance(daily_response[0].values, list), "Daily granularity should return data"
|
|
assert isinstance(raw_response[0].values, list), "No granularity should return data"
|
|
|
|
# Verify that different granularities produce different aggregation levels
|
|
# (The exact number depends on data distribution, but they should be queryable)
|
|
assert len(hourly_response[0].values) >= 0, "Hourly granularity should be queryable"
|
|
assert len(daily_response[0].values) >= 0, "Daily granularity should be queryable"
|
|
assert len(raw_response[0].values) >= 0, "No granularity should be queryable"
|