diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 0549dda21..99375056c 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -16022,12 +16022,16 @@
"value": {
"type": "number",
"description": "The numeric value of the metric at this timestamp"
+ },
+ "unit": {
+ "type": "string"
}
},
"additionalProperties": false,
"required": [
"timestamp",
- "value"
+ "value",
+ "unit"
],
"title": "MetricDataPoint",
"description": "A single data point in a metric time series."
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index aa47cd58d..1b19dae7d 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -11921,10 +11921,13 @@ components:
type: number
description: >-
The numeric value of the metric at this timestamp
+ unit:
+ type: string
additionalProperties: false
required:
- timestamp
- value
+ - unit
title: MetricDataPoint
description: >-
A single data point in a metric time series.
diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py
index 92422ac1b..8d1b5d697 100644
--- a/llama_stack/apis/telemetry/telemetry.py
+++ b/llama_stack/apis/telemetry/telemetry.py
@@ -386,6 +386,7 @@ class MetricDataPoint(BaseModel):
timestamp: int
value: float
+ unit: str
@json_schema_type
@@ -518,7 +519,7 @@ class Telemetry(Protocol):
metric_name: str,
start_time: int,
end_time: int | None = None,
- granularity: str | None = "1d",
+ granularity: str | None = None,
query_type: MetricQueryType = MetricQueryType.RANGE,
label_matchers: list[MetricLabelMatcher] | None = None,
) -> QueryMetricsResponse:
diff --git a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
index d99255c79..cdd2b9f9a 100644
--- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
@@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
+import datetime
import logging
import threading
from typing import Any
@@ -145,11 +146,41 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
metric_name: str,
start_time: int,
end_time: int | None = None,
- granularity: str | None = "1d",
+ granularity: str | None = None,
query_type: MetricQueryType = MetricQueryType.RANGE,
label_matchers: list[MetricLabelMatcher] | None = None,
) -> QueryMetricsResponse:
- raise NotImplementedError("Querying metrics is not implemented")
+ """Query metrics from the telemetry store.
+
+ Args:
+ metric_name: The name of the metric to query (e.g., "prompt_tokens")
+ start_time: Start time as Unix timestamp
+ end_time: End time as Unix timestamp (defaults to now if None)
+ granularity: Time granularity for aggregation
+ query_type: Type of query (RANGE or INSTANT)
+ label_matchers: Label filters to apply
+
+ Returns:
+ QueryMetricsResponse with metric time series data
+ """
+ # Convert timestamps to datetime objects
+ start_dt = datetime.datetime.fromtimestamp(start_time, datetime.UTC)
+ end_dt = datetime.datetime.fromtimestamp(end_time, datetime.UTC) if end_time else None
+
+ # Use SQLite trace store if available
+ if hasattr(self, "trace_store") and self.trace_store:
+ return await self.trace_store.query_metrics(
+ metric_name=metric_name,
+ start_time=start_dt,
+ end_time=end_dt,
+ granularity=granularity,
+ query_type=query_type,
+ label_matchers=label_matchers,
+ )
+ else:
+ raise ValueError(
+ f"In order to query_metrics, you must have {TelemetrySink.SQLITE} set in your telemetry sinks"
+ )
def _log_unstructured(self, event: UnstructuredLogEvent, ttl_seconds: int) -> None:
with self._lock:
diff --git a/llama_stack/providers/utils/telemetry/sqlite_trace_store.py b/llama_stack/providers/utils/telemetry/sqlite_trace_store.py
index 8dd6061a6..71480364c 100644
--- a/llama_stack/providers/utils/telemetry/sqlite_trace_store.py
+++ b/llama_stack/providers/utils/telemetry/sqlite_trace_store.py
@@ -5,12 +5,23 @@
# the root directory of this source tree.
import json
-from datetime import datetime
+from datetime import UTC, datetime
from typing import Protocol
import aiosqlite
-from llama_stack.apis.telemetry import QueryCondition, Span, SpanWithStatus, Trace
+from llama_stack.apis.telemetry import (
+ MetricDataPoint,
+ MetricLabel,
+ MetricLabelMatcher,
+ MetricQueryType,
+ MetricSeries,
+ QueryCondition,
+ QueryMetricsResponse,
+ Span,
+ SpanWithStatus,
+ Trace,
+)
class TraceStore(Protocol):
@@ -29,11 +40,192 @@ class TraceStore(Protocol):
max_depth: int | None = None,
) -> dict[str, SpanWithStatus]: ...
+    async def query_metrics(
+        self,
+        metric_name: str,
+        start_time: datetime,
+        end_time: datetime | None = None,
+        granularity: str | None = None,
+        query_type: MetricQueryType = MetricQueryType.RANGE,
+        label_matchers: list[MetricLabelMatcher] | None = None,
+    ) -> QueryMetricsResponse:
+        """Query stored metric events and return them as metric series.
+
+        The defaults mirror the concrete ``SQLiteTraceStore.query_metrics`` in this
+        module, so the protocol advertises the same behaviour implementations have
+        when an argument is omitted.
+
+        Args:
+            metric_name: The name of the metric to query (e.g., "prompt_tokens")
+            start_time: Start of the time window
+            end_time: End of the time window; ``None`` leaves the choice to the implementation
+            granularity: Time granularity for aggregation; ``None`` requests no bucketing
+            query_type: Type of query (RANGE or INSTANT)
+            label_matchers: Label filters to apply
+
+        Returns:
+            QueryMetricsResponse with metric time series data
+        """
+        ...
+
class SQLiteTraceStore(TraceStore):
def __init__(self, conn_string: str):
self.conn_string = conn_string
+    async def query_metrics(
+        self,
+        metric_name: str,
+        start_time: datetime,
+        end_time: datetime | None = None,
+        granularity: str | None = None,
+        query_type: MetricQueryType = MetricQueryType.RANGE,
+        label_matchers: list[MetricLabelMatcher] | None = None,
+    ) -> QueryMetricsResponse:
+        """Query metric events recorded in the ``span_events`` table.
+
+        Events are matched on the name ``metric.<metric_name>``; their ``value`` and
+        ``unit`` are read from the JSON ``attributes`` column.
+
+        Args:
+            metric_name: Metric name without the ``metric.`` prefix.
+            start_time: Lower bound of the time window (compared as an ISO-8601 string).
+            end_time: Upper bound of the time window; defaults to the current UTC time.
+            granularity: Bucket size hint (e.g. "1m", "1h", "1d") for RANGE queries.
+                When falsy, a RANGE query returns one data point per stored event.
+            query_type: INSTANT sums all matching values per unit; RANGE returns a
+                time series (bucketed when ``granularity`` is given).
+            label_matchers: Optional filters applied to keys of the JSON attributes.
+
+        Returns:
+            A QueryMetricsResponse holding a single MetricSeries, or an empty
+            response when no rows match.
+
+        Raises:
+            ValueError: If an expected time column is missing or NULL in a result row.
+        """
+        if end_time is None:
+            end_time = datetime.now(UTC)
+
+        # strftime pattern used for bucketed RANGE queries; stays None otherwise.
+        time_format = None
+
+        # Build base query
+        if query_type == MetricQueryType.INSTANT:
+            query = """
+                SELECT
+                    se.name,
+                    SUM(CAST(json_extract(se.attributes, '$.value') AS REAL)) as value,
+                    json_extract(se.attributes, '$.unit') as unit,
+                    se.attributes
+                FROM span_events se
+                WHERE se.name = ?
+                AND se.timestamp BETWEEN ? AND ?
+            """
+        elif granularity:
+            # The pattern comes from a fixed internal table, never from the caller,
+            # so interpolating it into the SQL text is safe.
+            time_format = self._get_time_format_for_granularity(granularity)
+            query = f"""
+                SELECT
+                    se.name,
+                    SUM(CAST(json_extract(se.attributes, '$.value') AS REAL)) as value,
+                    json_extract(se.attributes, '$.unit') as unit,
+                    se.attributes,
+                    strftime('{time_format}', se.timestamp) as bucket_start
+                FROM span_events se
+                WHERE se.name = ?
+                AND se.timestamp BETWEEN ? AND ?
+            """
+        else:
+            query = """
+                SELECT
+                    se.name,
+                    json_extract(se.attributes, '$.value') as value,
+                    json_extract(se.attributes, '$.unit') as unit,
+                    se.attributes,
+                    se.timestamp
+                FROM span_events se
+                WHERE se.name = ?
+                AND se.timestamp BETWEEN ? AND ?
+            """
+
+        params = [f"metric.{metric_name}", start_time.isoformat(), end_time.isoformat()]
+
+        # Labels that will be attached to the MetricSeries (preserve matcher labels)
+        all_labels: list[MetricLabel] = []
+        matcher_label_names = set()
+        if label_matchers:
+            for matcher in label_matchers:
+                # The label name is caller-supplied, so the JSON path is passed as a
+                # bound parameter instead of being spliced into the SQL text.
+                json_path = f"$.{matcher.name}"
+                if matcher.operator == "=":
+                    query += " AND json_extract(se.attributes, ?) = ?"
+                    params.extend([json_path, matcher.value])
+                elif matcher.operator == "!=":
+                    query += " AND json_extract(se.attributes, ?) != ?"
+                    params.extend([json_path, matcher.value])
+                elif matcher.operator == "=~":
+                    # Regex matchers are approximated with a substring LIKE.
+                    query += " AND json_extract(se.attributes, ?) LIKE ?"
+                    params.extend([json_path, f"%{matcher.value}%"])
+                elif matcher.operator == "!~":
+                    query += " AND json_extract(se.attributes, ?) NOT LIKE ?"
+                    params.extend([json_path, f"%{matcher.value}%"])
+                # Preserve filter context in output
+                all_labels.append(MetricLabel(name=matcher.name, value=str(matcher.value)))
+                matcher_label_names.add(matcher.name)
+
+        bucketed = query_type == MetricQueryType.RANGE and bool(granularity)
+
+        # GROUP BY / ORDER BY logic
+        if bucketed:
+            query += f" GROUP BY strftime('{time_format}', se.timestamp), json_extract(se.attributes, '$.unit')"
+            query += " ORDER BY bucket_start"
+        elif query_type == MetricQueryType.INSTANT:
+            query += " GROUP BY json_extract(se.attributes, '$.unit')"
+        else:
+            query += " ORDER BY se.timestamp"
+
+        # Execute query
+        async with aiosqlite.connect(self.conn_string) as conn:
+            conn.row_factory = aiosqlite.Row
+            async with conn.execute(query, params) as cursor:
+                rows = await cursor.fetchall()
+
+        if not rows:
+            return QueryMetricsResponse(data=[])
+
+        data_points = []
+        # Attribute-derived labels are added once each, and never shadow a matcher label.
+        attr_label_names = set()
+        for row in rows:
+            # Parse JSON attributes defensively; without attributes the row simply
+            # contributes no extra labels.
+            try:
+                attributes = json.loads(row["attributes"] or "{}")
+            except (TypeError, json.JSONDecodeError):
+                attributes = {}
+
+            value = row["value"]
+            unit = row["unit"] or ""
+
+            # Skip the payload keys and anything already recorded, otherwise the
+            # series would carry many duplicate labels.
+            for k, v in attributes.items():
+                if k not in ["value", "unit"] and k not in matcher_label_names and k not in attr_label_names:
+                    all_labels.append(MetricLabel(name=k, value=str(v)))
+                    attr_label_names.add(k)
+
+            # Determine timestamp
+            if bucketed:
+                try:
+                    bucket_start_raw = row["bucket_start"]
+                except (KeyError, IndexError) as e:
+                    # sqlite3.Row reports an unknown column name as IndexError.
+                    raise ValueError(
+                        "DB did not have a bucket_start time in row when using granularity, this indicates improper formatting"
+                    ) from e
+                # strftime() yields NULL when the stored timestamp cannot be parsed.
+                if bucket_start_raw is None:
+                    raise ValueError("bucket_start is None check time format and data")
+                bucket_start = datetime.fromisoformat(bucket_start_raw)
+                # SQLite's strftime() emits UTC without an offset; pin it to UTC so
+                # the epoch value does not depend on the host's local timezone.
+                if bucket_start.tzinfo is None:
+                    bucket_start = bucket_start.replace(tzinfo=UTC)
+                timestamp = int(bucket_start.timestamp())
+            elif query_type == MetricQueryType.INSTANT:
+                timestamp = int(datetime.now(UTC).timestamp())
+            else:
+                try:
+                    timestamp_raw = row["timestamp"]
+                except (KeyError, IndexError) as e:
+                    raise ValueError(
+                        "DB did not have a timestamp in row, this indicates improper formatting"
+                    ) from e
+                if timestamp_raw is None:
+                    raise ValueError("timestamp is None check time format and data")
+                # NOTE(review): a stored value without a UTC offset is interpreted in
+                # the host's local timezone here - confirm how the writer formats it.
+                timestamp_iso = datetime.fromisoformat(timestamp_raw)
+                timestamp = int(timestamp_iso.timestamp())
+
+            data_points.append(
+                MetricDataPoint(
+                    timestamp=timestamp,
+                    value=value,
+                    unit=unit,
+                )
+            )
+
+        metric_series = [MetricSeries(metric=metric_name, labels=all_labels, values=data_points)]
+        return QueryMetricsResponse(data=metric_series)
+
+    def _get_time_format_for_granularity(self, granularity: str | None) -> str:
+        """Pick the SQLite strftime pattern that truncates timestamps to a bucket.
+
+        Only the trailing unit character of ``granularity`` is inspected; the numeric
+        part is ignored, so "5m" and "1m" both bucket per minute.
+
+        Args:
+            granularity: Granularity string (e.g., "1m", "5m", "1h", "1d")
+        Returns:
+            SQLite strftime format string for the granularity
+        Raises:
+            ValueError: If ``granularity`` is None.
+        """
+        if granularity is None:
+            raise ValueError("granularity cannot be None for this method - use separate logic for no aggregation")
+
+        # Day and hour buckets zero out the finer-grained fields.
+        coarse_formats = {
+            "d": "%Y-%m-%d 00:00:00",
+            "h": "%Y-%m-%d %H:00:00",
+        }
+        # Minutes, and any unrecognised suffix, use the finest supported bucket.
+        return coarse_formats.get(granularity[-1:], "%Y-%m-%d %H:%M:00")
+
async def query_traces(
self,
attribute_filters: list[QueryCondition] | None = None,
diff --git a/tests/integration/recordings/index.sqlite b/tests/integration/recordings/index.sqlite
index 6f73bb3a0..089f55273 100644
Binary files a/tests/integration/recordings/index.sqlite and b/tests/integration/recordings/index.sqlite differ
diff --git a/tests/integration/recordings/responses/390f0c7dac96.json b/tests/integration/recordings/responses/390f0c7dac96.json
new file mode 100644
index 000000000..e8c9528fb
--- /dev/null
+++ b/tests/integration/recordings/responses/390f0c7dac96.json
@@ -0,0 +1,39 @@
+{
+ "request": {
+ "method": "POST",
+ "url": "http://localhost:11434/api/generate",
+ "headers": {},
+ "body": {
+ "model": "llama3.2:3b-instruct-fp16",
+ "raw": true,
+ "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 1<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
+ "options": {
+ "temperature": 0.0
+ },
+ "stream": false
+ },
+ "endpoint": "/api/generate",
+ "model": "llama3.2:3b-instruct-fp16"
+ },
+ "response": {
+ "body": {
+ "__type__": "ollama._types.GenerateResponse",
+ "__data__": {
+ "model": "llama3.2:3b-instruct-fp16",
+ "created_at": "2025-08-11T15:51:18.170868Z",
+ "done": true,
+ "done_reason": "stop",
+ "total_duration": 5240614083,
+ "load_duration": 9823416,
+ "prompt_eval_count": 21,
+ "prompt_eval_duration": 21000000,
+ "eval_count": 310,
+ "eval_duration": 5209000000,
+ "response": "This is the start of a test. I'll provide some sample data and you can try to generate metrics based on it.\n\n**Data:**\n\nLet's say we have a dataset of user interactions with an e-commerce website. The data includes:\n\n| User ID | Product Name | Purchase Date | Quantity | Price |\n| --- | --- | --- | --- | --- |\n| 1 | iPhone 13 | 2022-01-01 | 2 | 999.99 |\n| 1 | MacBook Air | 2022-01-05 | 1 | 1299.99 |\n| 2 | Samsung TV | 2022-01-10 | 3 | 899.99 |\n| 3 | iPhone 13 | 2022-01-15 | 1 | 999.99 |\n| 4 | MacBook Pro | 2022-01-20 | 2 | 1799.99 |\n\n**Task:**\n\nYour task is to generate the following metrics based on this data:\n\n1. Average order value (AOV)\n2. Conversion rate\n3. Average revenue per user (ARPU)\n4. Customer lifetime value (CLV)\n\nPlease provide your answers in a format like this:\n\n| Metric | Value |\n| --- | --- |\n| AOV | 1234.56 |\n| Conversion Rate | 0.25 |\n| ARPU | 1000.00 |\n| CLV | 5000.00 |\n\nGo ahead and generate the metrics!",
+ "thinking": null,
+ "context": null
+ }
+ },
+ "is_streaming": false
+ }
+}
diff --git a/tests/integration/recordings/responses/4de6877d86fa.json b/tests/integration/recordings/responses/4de6877d86fa.json
new file mode 100644
index 000000000..c33176b04
--- /dev/null
+++ b/tests/integration/recordings/responses/4de6877d86fa.json
@@ -0,0 +1,56 @@
+{
+ "request": {
+ "method": "POST",
+ "url": "http://localhost:11434/v1/v1/chat/completions",
+ "headers": {},
+ "body": {
+ "model": "llama3.2:3b",
+ "messages": [
+ {
+ "role": "user",
+ "content": "OpenAI test 0"
+ }
+ ],
+ "stream": false
+ },
+ "endpoint": "/v1/chat/completions",
+ "model": "llama3.2:3b"
+ },
+ "response": {
+ "body": {
+ "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+ "__data__": {
+ "id": "chatcmpl-159",
+ "choices": [
+ {
+ "finish_reason": "stop",
+ "index": 0,
+ "logprobs": null,
+ "message": {
+ "content": "It seems like you're trying to access an internal system, but I don't have any specific information about OpenAI's internal testing environment or how to interact with it. However, I can tell you that OpenAI is a company that provides various AI-related products and services, including the popular language model called Llama.\n\nIf you're looking for information on how to use OpenAI's products or APIs, I'd be happy to help with that! Alternatively, if you have any specific questions about AI or machine learning, feel free to ask, and I'll do my best to provide a helpful response.",
+ "refusal": null,
+ "role": "assistant",
+ "annotations": null,
+ "audio": null,
+ "function_call": null,
+ "tool_calls": null
+ }
+ }
+ ],
+ "created": 1754927774,
+ "model": "llama3.2:3b",
+ "object": "chat.completion",
+ "service_tier": null,
+ "system_fingerprint": "fp_ollama",
+ "usage": {
+ "completion_tokens": 122,
+ "prompt_tokens": 30,
+ "total_tokens": 152,
+ "completion_tokens_details": null,
+ "prompt_tokens_details": null
+ }
+ }
+ },
+ "is_streaming": false
+ }
+}
diff --git a/tests/integration/recordings/responses/5db0c44c83a4.json b/tests/integration/recordings/responses/5db0c44c83a4.json
new file mode 100644
index 000000000..c1a66e56a
--- /dev/null
+++ b/tests/integration/recordings/responses/5db0c44c83a4.json
@@ -0,0 +1,56 @@
+{
+ "request": {
+ "method": "POST",
+ "url": "http://localhost:11434/v1/v1/chat/completions",
+ "headers": {},
+ "body": {
+ "model": "llama3.2:3b",
+ "messages": [
+ {
+ "role": "user",
+ "content": "OpenAI test 1"
+ }
+ ],
+ "stream": false
+ },
+ "endpoint": "/v1/chat/completions",
+ "model": "llama3.2:3b"
+ },
+ "response": {
+ "body": {
+ "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+ "__data__": {
+ "id": "chatcmpl-831",
+ "choices": [
+ {
+ "finish_reason": "stop",
+ "index": 0,
+ "logprobs": null,
+ "message": {
+ "content": "I'm ready to help. What would you like to do? Would you like to:\n\n1. Ask a question\n2. Engage in a conversation\n3. Test my language understanding\n4. Something else (please specify)",
+ "refusal": null,
+ "role": "assistant",
+ "annotations": null,
+ "audio": null,
+ "function_call": null,
+ "tool_calls": null
+ }
+ }
+ ],
+ "created": 1754927774,
+ "model": "llama3.2:3b",
+ "object": "chat.completion",
+ "service_tier": null,
+ "system_fingerprint": "fp_ollama",
+ "usage": {
+ "completion_tokens": 48,
+ "prompt_tokens": 30,
+ "total_tokens": 78,
+ "completion_tokens_details": null,
+ "prompt_tokens_details": null
+ }
+ }
+ },
+ "is_streaming": false
+ }
+}
diff --git a/tests/integration/recordings/responses/7bcb0f86c91b.json b/tests/integration/recordings/responses/7bcb0f86c91b.json
new file mode 100644
index 000000000..4c9a55153
--- /dev/null
+++ b/tests/integration/recordings/responses/7bcb0f86c91b.json
@@ -0,0 +1,39 @@
+{
+ "request": {
+ "method": "POST",
+ "url": "http://localhost:11434/api/generate",
+ "headers": {},
+ "body": {
+ "model": "llama3.2:3b-instruct-fp16",
+ "raw": true,
+ "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 0<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
+ "options": {
+ "temperature": 0.0
+ },
+ "stream": false
+ },
+ "endpoint": "/api/generate",
+ "model": "llama3.2:3b-instruct-fp16"
+ },
+ "response": {
+ "body": {
+ "__type__": "ollama._types.GenerateResponse",
+ "__data__": {
+ "model": "llama3.2:3b-instruct-fp16",
+ "created_at": "2025-08-11T15:51:12.918723Z",
+ "done": true,
+ "done_reason": "stop",
+ "total_duration": 8868987792,
+ "load_duration": 2793275292,
+ "prompt_eval_count": 21,
+ "prompt_eval_duration": 250000000,
+ "eval_count": 344,
+ "eval_duration": 5823000000,
+ "response": "Here are some common test metrics used to evaluate the performance of a system:\n\n1. **Accuracy**: The proportion of correct predictions or classifications out of total predictions made.\n2. **Precision**: The ratio of true positives (correctly predicted instances) to the sum of true positives and false positives (incorrectly predicted instances).\n3. **Recall**: The ratio of true positives to the sum of true positives and false negatives (missed instances).\n4. **F1-score**: The harmonic mean of precision and recall, providing a balanced measure of both.\n5. **Mean Squared Error (MSE)**: The average squared difference between predicted and actual values.\n6. **Mean Absolute Error (MAE)**: The average absolute difference between predicted and actual values.\n7. **Root Mean Squared Percentage Error (RMSPE)**: The square root of the mean of the squared percentage differences between predicted and actual values.\n8. **Coefficient of Determination (R-squared, R2)**: Measures how well a model fits the data, with higher values indicating better fit.\n9. **Mean Absolute Percentage Error (MAPE)**: The average absolute percentage difference between predicted and actual values.\n10. **Normalized Mean Squared Error (NMSE)**: Similar to MSE, but normalized by the mean of the actual values.\n\nThese metrics can be used for various types of data, including:\n\n* Regression problems (e.g., predicting continuous values)\n* Classification problems (e.g., predicting categorical labels)\n* Time series forecasting\n* Clustering and dimensionality reduction\n\nWhen choosing a metric, consider the specific problem you're trying to solve, the type of data, and the desired level of precision.",
+ "thinking": null,
+ "context": null
+ }
+ },
+ "is_streaming": false
+ }
+}
diff --git a/tests/integration/recordings/responses/c31a86ea6c58.json b/tests/integration/recordings/responses/c31a86ea6c58.json
new file mode 100644
index 000000000..b8d109ddd
--- /dev/null
+++ b/tests/integration/recordings/responses/c31a86ea6c58.json
@@ -0,0 +1,39 @@
+{
+ "request": {
+ "method": "POST",
+ "url": "http://localhost:11434/api/generate",
+ "headers": {},
+ "body": {
+ "model": "llama3.2:3b",
+ "raw": true,
+ "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 0<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
+ "options": {
+ "temperature": 0.0
+ },
+ "stream": false
+ },
+ "endpoint": "/api/generate",
+ "model": "llama3.2:3b"
+ },
+ "response": {
+ "body": {
+ "__type__": "ollama._types.GenerateResponse",
+ "__data__": {
+ "model": "llama3.2:3b",
+ "created_at": "2025-08-11T15:56:06.703788Z",
+ "done": true,
+ "done_reason": "stop",
+ "total_duration": 2722294000,
+ "load_duration": 9736083,
+ "prompt_eval_count": 21,
+ "prompt_eval_duration": 113000000,
+ "eval_count": 324,
+ "eval_duration": 2598000000,
+ "response": "Here are some test metrics that can be used to evaluate the performance of a system:\n\n1. **Accuracy**: The proportion of correct predictions made by the model.\n2. **Precision**: The ratio of true positives (correctly predicted instances) to total positive predictions.\n3. **Recall**: The ratio of true positives to the sum of true positives and false negatives (missed instances).\n4. **F1-score**: The harmonic mean of precision and recall, providing a balanced measure of both.\n5. **Mean Squared Error (MSE)**: The average squared difference between predicted and actual values.\n6. **Mean Absolute Error (MAE)**: The average absolute difference between predicted and actual values.\n7. **Root Mean Squared Percentage Error (RMSPE)**: A variation of MSE that expresses the error as a percentage.\n8. **Coefficient of Determination (R-squared, R2)**: Measures how well the model explains the variance in the data.\n9. **Mean Absolute Percentage Error (MAPE)**: The average absolute percentage difference between predicted and actual values.\n10. **Mean Squared Logarithmic Error (MSLE)**: A variation of MSE that is more suitable for skewed distributions.\n\nThese metrics can be used to evaluate different aspects of a system's performance, such as:\n\n* Classification models: accuracy, precision, recall, F1-score\n* Regression models: MSE, MAE, RMSPE, R2, MSLE\n* Time series forecasting: MAPE, RMSPE\n\nNote that the choice of metric depends on the specific problem and data.",
+ "thinking": null,
+ "context": null
+ }
+ },
+ "is_streaming": false
+ }
+}
diff --git a/tests/integration/recordings/responses/d0ac68cbde69.json b/tests/integration/recordings/responses/d0ac68cbde69.json
index e9a939aa0..f99c7ca38 100644
--- a/tests/integration/recordings/responses/d0ac68cbde69.json
+++ b/tests/integration/recordings/responses/d0ac68cbde69.json
@@ -16,7 +16,7 @@
"model": "llama3.2:3b",
"name": "llama3.2:3b",
"digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
- "expires_at": "2025-08-06T15:57:21.573326-04:00",
+ "expires_at": "2025-08-11T11:58:17.058489-04:00",
"size": 4030033920,
"size_vram": 4030033920,
"details": {
@@ -28,7 +28,27 @@
],
"parameter_size": "3.2B",
"quantization_level": "Q4_K_M"
- }
+ },
+ "context_length": null
+ },
+ {
+ "model": "llama3.2:3b-instruct-fp16",
+ "name": "llama3.2:3b-instruct-fp16",
+ "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+ "expires_at": "2025-08-11T11:56:22.727361-04:00",
+ "size": 8581748736,
+ "size_vram": 8581748736,
+ "details": {
+ "parent_model": "",
+ "format": "gguf",
+ "family": "llama",
+ "families": [
+ "llama"
+ ],
+ "parameter_size": "3.2B",
+ "quantization_level": "F16"
+ },
+ "context_length": null
}
]
}
diff --git a/tests/integration/recordings/responses/f6857bcea729.json b/tests/integration/recordings/responses/f6857bcea729.json
new file mode 100644
index 000000000..404bfb987
--- /dev/null
+++ b/tests/integration/recordings/responses/f6857bcea729.json
@@ -0,0 +1,39 @@
+{
+ "request": {
+ "method": "POST",
+ "url": "http://localhost:11434/api/generate",
+ "headers": {},
+ "body": {
+ "model": "llama3.2:3b",
+ "raw": true,
+ "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 2<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
+ "options": {
+ "temperature": 0.0
+ },
+ "stream": false
+ },
+ "endpoint": "/api/generate",
+ "model": "llama3.2:3b"
+ },
+ "response": {
+ "body": {
+ "__type__": "ollama._types.GenerateResponse",
+ "__data__": {
+ "model": "llama3.2:3b",
+ "created_at": "2025-08-11T15:56:13.082679Z",
+ "done": true,
+ "done_reason": "stop",
+ "total_duration": 2606245291,
+ "load_duration": 9979708,
+ "prompt_eval_count": 21,
+ "prompt_eval_duration": 23000000,
+ "eval_count": 321,
+ "eval_duration": 2572000000,
+ "response": "Here are some test metrics that can be used to evaluate the performance of a system:\n\n1. **Accuracy**: Measures how close the predicted values are to the actual values.\n2. **Precision**: Measures the proportion of true positives among all positive predictions made by the model.\n3. **Recall**: Measures the proportion of true positives among all actual positive instances.\n4. **F1-score**: The harmonic mean of precision and recall, providing a balanced measure of both.\n5. **Mean Squared Error (MSE)**: Measures the average squared difference between predicted and actual values.\n6. **Mean Absolute Error (MAE)**: Measures the average absolute difference between predicted and actual values.\n7. **Root Mean Squared Percentage Error (RMSPE)**: A variation of MSE that expresses errors as a percentage of the actual value.\n8. **Coefficient of Determination (R-squared, R2)**: Measures how well the model explains the variance in the data.\n9. **Mean Absolute Percentage Error (MAPE)**: Measures the average absolute percentage difference between predicted and actual values.\n10. **Mean Squared Logarithmic Error (MSLE)**: A variation of MSE that is more suitable for skewed distributions.\n\nThese metrics can be used to evaluate different aspects of a system's performance, such as:\n\n* Classification models: accuracy, precision, recall, F1-score\n* Regression models: MSE, MAE, RMSPE, R2\n* Time series forecasting: MAPE, MSLE\n\nNote that the choice of metric depends on the specific problem and data.",
+ "thinking": null,
+ "context": null
+ }
+ },
+ "is_streaming": false
+ }
+}
diff --git a/tests/integration/recordings/responses/f80b99430f7e.json b/tests/integration/recordings/responses/f80b99430f7e.json
new file mode 100644
index 000000000..5b692f4ca
--- /dev/null
+++ b/tests/integration/recordings/responses/f80b99430f7e.json
@@ -0,0 +1,39 @@
+{
+ "request": {
+ "method": "POST",
+ "url": "http://localhost:11434/api/generate",
+ "headers": {},
+ "body": {
+ "model": "llama3.2:3b",
+ "raw": true,
+ "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 1<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
+ "options": {
+ "temperature": 0.0
+ },
+ "stream": false
+ },
+ "endpoint": "/api/generate",
+ "model": "llama3.2:3b"
+ },
+ "response": {
+ "body": {
+ "__type__": "ollama._types.GenerateResponse",
+ "__data__": {
+ "model": "llama3.2:3b",
+ "created_at": "2025-08-11T15:56:10.465932Z",
+ "done": true,
+ "done_reason": "stop",
+ "total_duration": 3745686709,
+ "load_duration": 9734584,
+ "prompt_eval_count": 21,
+ "prompt_eval_duration": 23000000,
+ "eval_count": 457,
+ "eval_duration": 3712000000,
+ "response": "Here are some test metrics that can be used to evaluate the performance of a system:\n\n**Primary Metrics**\n\n1. **Response Time**: The time it takes for the system to respond to a request.\n2. **Throughput**: The number of requests processed by the system per unit time (e.g., requests per second).\n3. **Error Rate**: The percentage of requests that result in an error.\n\n**Secondary Metrics**\n\n1. **Average Response Time**: The average response time for all requests.\n2. **Median Response Time**: The middle value of the response times, used to detect outliers.\n3. **99th Percentile Response Time**: The response time at which 99% of requests are completed within this time.\n4. **Request Latency**: The difference between the request arrival time and the response time.\n\n**User Experience Metrics**\n\n1. **User Satisfaction (USAT)**: Measured through surveys or feedback forms to gauge user satisfaction with the system's performance.\n2. **First Response Time**: The time it takes for a user to receive their first response from the system.\n3. **Time Spent in System**: The total amount of time a user spends interacting with the system.\n\n**System Resource Metrics**\n\n1. **CPU Utilization**: The percentage of CPU resources being used by the system.\n2. **Memory Usage**: The amount of memory being used by the system.\n3. **Disk I/O Wait Time**: The average time spent waiting for disk I/O operations to complete.\n\n**Security Metrics**\n\n1. **Authentication Success Rate**: The percentage of successful authentication attempts.\n2. **Authorization Success Rate**: The percentage of successful authorization attempts.\n3. **Error Rate (Security)**: The percentage of security-related errors.\n\n**Other Metrics**\n\n1. **Page Load Time**: The time it takes for a page to load.\n2. **Click-Through Rate (CTR)**: The percentage of users who click on a link or button after seeing an ad or notification.\n3. 
**Conversion Rate**: The percentage of users who complete a desired action (e.g., fill out a form, make a purchase).\n\nThese metrics can be used to evaluate the performance and effectiveness of various aspects of your system, from user experience to security and resource utilization.",
+ "thinking": null,
+ "context": null
+ }
+ },
+ "is_streaming": false
+ }
+}
diff --git a/tests/integration/telemetry/test_telemetry_metrics.py b/tests/integration/telemetry/test_telemetry_metrics.py
new file mode 100644
index 000000000..5e3b42ea2
--- /dev/null
+++ b/tests/integration/telemetry/test_telemetry_metrics.py
@@ -0,0 +1,208 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import time
+from datetime import UTC, datetime, timedelta
+
+import pytest
+
+
+@pytest.fixture(scope="module", autouse=True)
+def setup_telemetry_metrics_data(openai_client, client_with_models, text_model_id):
+    """Create telemetry metrics data once per module before the tests run.
+
+    Returns a dict mapping each token metric name to the list of counts reported
+    in the usage of the OpenAI-compatible completions issued here.
+    """
+    # Skip OpenAI tests if running in library mode (the client then has no base_url).
+    if not hasattr(client_with_models, "base_url"):
+        pytest.skip("OpenAI client tests not supported with library client")
+
+    # Issue a few native chat completions; only the calls matter, not their results.
+    for i in range(3):
+        client_with_models.inference.chat_completion(
+            model_id=text_model_id,
+            messages=[{"role": "user", "content": f"Test metrics generation {i}"}],
+            stream=False,  # Ensure metrics are captured
+        )
+
+    # Create OpenAI completions with the proper OpenAI client and record their reported usage.
+    usage = {"prompt_tokens": [], "completion_tokens": [], "total_tokens": []}
+    for i in range(2):
+        response = openai_client.chat.completions.create(
+            model=text_model_id,
+            messages=[{"role": "user", "content": f"OpenAI test {i}"}],
+            stream=False,
+        )
+        for name, counts in usage.items():
+            counts.append(getattr(response.usage, name))
+
+    # Wait for metrics to be logged: poll for up to 30 seconds on a monotonic clock.
+    deadline = time.monotonic() + 30
+    while time.monotonic() < deadline:
+        try:
+            metrics_response = client_with_models.telemetry.query_metrics(
+                metric_name="completion_tokens",
+                start_time=int((datetime.now(UTC) - timedelta(minutes=5)).timestamp()),
+            )
+            if metrics_response and metrics_response[0].values:
+                break
+        except Exception:
+            # Best-effort probe: a failed query is retried until the deadline passes.
+            pass
+        time.sleep(1)
+
+    # Wait additional time to ensure all metrics are processed
+    time.sleep(5)
+
+    return usage
+
+
+def test_query_metrics_prompt_tokens(client_with_models, text_model_id, setup_telemetry_metrics_data):
+    """Query prompt_tokens from the last 10 minutes and check the latest point against setup usage."""
+    start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())
+    response = client_with_models.telemetry.query_metrics(
+        metric_name="prompt_tokens",
+        start_time=start_time,
+    )
+
+    assert isinstance(response, list)
+    # Fail with a clear message instead of an IndexError when nothing came back.
+    assert response, "Expected at least one prompt_tokens series"
+    series = response[0]
+    assert series.metric == "prompt_tokens"
+    assert isinstance(series.values, list), "Series values should be a list of data points"
+    assert series.values, "Expected at least one prompt_tokens data point"
+
+    # Use the actual values from setup instead of hardcoded values
+    expected_values = setup_telemetry_metrics_data["prompt_tokens"]
+    latest = series.values[-1].value
+    assert latest in expected_values, f"Expected one of {expected_values}, got {latest}"
+
+
+def test_query_metrics_completion_tokens(client_with_models, text_model_id, setup_telemetry_metrics_data):
+    """Query completion_tokens from the last 10 minutes and check the latest point against setup usage."""
+    start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())
+    response = client_with_models.telemetry.query_metrics(
+        metric_name="completion_tokens",
+        start_time=start_time,
+    )
+
+    assert isinstance(response, list)
+    # Fail with a clear message instead of an IndexError when nothing came back.
+    assert response, "Expected at least one completion_tokens series"
+    series = response[0]
+    assert series.metric == "completion_tokens"
+    assert isinstance(series.values, list), "Series values should be a list of data points"
+    assert series.values, "Expected at least one completion_tokens data point"
+
+    # Use the actual values from setup instead of hardcoded values
+    expected_values = setup_telemetry_metrics_data["completion_tokens"]
+    latest = series.values[-1].value
+    assert latest in expected_values, f"Expected one of {expected_values}, got {latest}"
+
+
+def test_query_metrics_total_tokens(client_with_models, text_model_id, setup_telemetry_metrics_data):
+    """Query total_tokens from the last 10 minutes and check the latest point against setup usage."""
+    start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())
+    response = client_with_models.telemetry.query_metrics(
+        metric_name="total_tokens",
+        start_time=start_time,
+    )
+
+    assert isinstance(response, list)
+    # Fail with a clear message instead of an IndexError when nothing came back.
+    assert response, "Expected at least one total_tokens series"
+    series = response[0]
+    assert series.metric == "total_tokens"
+    assert isinstance(series.values, list), "Series values should be a list of data points"
+    assert series.values, "Expected at least one total_tokens data point"
+
+    # Use the actual values from setup instead of hardcoded values
+    expected_values = setup_telemetry_metrics_data["total_tokens"]
+    latest = series.values[-1].value
+    assert latest in expected_values, f"Expected one of {expected_values}, got {latest}"
+
+
+def test_query_metrics_with_time_range(llama_stack_client, text_model_id):
+    """Query prompt_tokens with an explicit start and end time spanning the last 10 minutes."""
+    end_time = int(datetime.now(UTC).timestamp())
+    start_time = end_time - 600  # 10 minutes ago
+
+    response = llama_stack_client.telemetry.query_metrics(
+        metric_name="prompt_tokens",
+        start_time=start_time,
+        end_time=end_time,
+    )
+
+    assert isinstance(response, list)
+    # Fail with a clear message instead of an IndexError when no series came back.
+    assert response, "Expected at least one prompt_tokens series in the time range"
+    assert response[0].metric == "prompt_tokens"
+    assert isinstance(response[0].values, list), "Series values should be a list of data points"
+
+
+def test_query_metrics_with_label_matchers(llama_stack_client, text_model_id):
+    """Query prompt_tokens filtered by an equality label matcher on model_id."""
+    start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())
+    response = llama_stack_client.telemetry.query_metrics(
+        metric_name="prompt_tokens",
+        start_time=start_time,
+        label_matchers=[{"name": "model_id", "value": text_model_id, "operator": "="}],
+    )
+    # Fail with a clear message instead of an IndexError when no series matched.
+    assert response, "Expected at least one series matching the model_id label"
+    assert isinstance(response[0].values, list), "Series values should be a list of data points"
+
+
+def test_query_metrics_nonexistent_metric(llama_stack_client):
+    """Check that an unknown metric name yields an empty list."""
+    ten_minutes_ago = datetime.now(UTC) - timedelta(minutes=10)
+
+    result = llama_stack_client.telemetry.query_metrics(
+        metric_name="nonexistent_metric",
+        start_time=int(ten_minutes_ago.timestamp()),
+    )
+
+    assert isinstance(result, list), "Should return an empty list for nonexistent metric"
+    assert not result
+
+
+def test_query_metrics_with_granularity(llama_stack_client, text_model_id):
+    """Test that total_tokens is queryable with hourly, daily, and no granularity."""
+    start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())
+
+    # Test hourly granularity
+    hourly_response = llama_stack_client.telemetry.query_metrics(
+        metric_name="total_tokens",
+        start_time=start_time,
+        granularity="1h",
+    )
+
+    # Test daily granularity
+    daily_response = llama_stack_client.telemetry.query_metrics(
+        metric_name="total_tokens",
+        start_time=start_time,
+        granularity="1d",
+    )
+
+    # Test no granularity (raw data points)
+    raw_response = llama_stack_client.telemetry.query_metrics(
+        metric_name="total_tokens",
+        start_time=start_time,
+        granularity=None,
+    )
+
+    # Guard each result so an empty response fails with a clear message, not an IndexError.
+    # The point count depends on data distribution, so it is not asserted (`>= 0` is always true).
+    results = {
+        "Hourly": hourly_response,
+        "Daily": daily_response,
+        "No": raw_response,
+    }
+    for label, response in results.items():
+        assert response, f"{label} granularity should return at least one series"
+        assert isinstance(response[0].values, list), f"{label} granularity should return data"