From c1a9a21380f6d9e22346f3cf9cf85ecc0003fd0c Mon Sep 17 00:00:00 2001
From: Charlie Doern
Date: Thu, 7 Aug 2025 20:03:58 -0400
Subject: [PATCH 1/2] feat: implement query_metrics

query_metrics currently has no implementation, meaning once a metric is
emitted there is no way in Llama Stack to query it from the store.

Implement query_metrics for the meta_reference provider, following a similar
style to `query_traces`: use the trace_store to format an SQL query and
execute it. In this case the parameters for the query are
`metric.METRIC_NAME`, `start_time`, and `end_time`.

This required client-side changes since the client had no `query_metrics` or
any associated resources, so any tests here will fail, but I will provide
manual execution logs for the new tests I am adding. Order the metrics by
timestamp.

Additionally, add `unit` to the `MetricDataPoint` class since this adds much
more context to the metric being queried.

These metrics can also be aggregated via a `granularity` parameter. This is
pre-defined as a string like `1m`, `1h`, or `1d`, where metrics occurring in
the same specified timespan are aggregated together.

Signed-off-by: Charlie Doern
---
 docs/_static/llama-stack-spec.html            |   6 +-
 docs/_static/llama-stack-spec.yaml            |   3 +
 llama_stack/apis/telemetry/telemetry.py       |   3 +-
 .../telemetry/meta_reference/telemetry.py     |  35 +++-
 .../utils/telemetry/sqlite_trace_store.py     | 196 +++++++++++++++++-
 5 files changed, 237 insertions(+), 6 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index e2c53d4b0..722c30fd6 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -15885,12 +15885,16 @@
                 "value": {
                     "type": "number",
                     "description": "The numeric value of the metric at this timestamp"
+                },
+                "unit": {
+                    "type": "string"
                 }
             },
             "additionalProperties": false,
             "required": [
                 "timestamp",
-                "value"
+                "value",
+                "unit"
             ],
             "title": "MetricDataPoint",
             "description": "A single data point in a metric time series."
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index 85cec3a78..e54d7c971 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -11810,10 +11810,13 @@ components:
           type: number
           description: >-
             The numeric value of the metric at this timestamp
+        unit:
+          type: string
       additionalProperties: false
       required:
         - timestamp
         - value
+        - unit
       title: MetricDataPoint
       description: >-
         A single data point in a metric time series.
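Before the API and provider changes below, a hedged sketch of how the new endpoint is expected to be called. The method name, parameters, and response shape mirror the API change and the integration tests added in PATCH 2/2; the client construction, base_url, and model id are assumptions for illustration only.

# Sketch: querying the new telemetry metrics endpoint (client setup is assumed).
from datetime import UTC, datetime, timedelta

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # hypothetical local stack

start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())

# Query prompt_tokens emitted in the last 10 minutes, filtered to one model
# and aggregated into hourly buckets via the new `granularity` parameter.
series = client.telemetry.query_metrics(
    metric_name="prompt_tokens",
    start_time=start_time,
    granularity="1h",
    label_matchers=[{"name": "model_id", "value": "llama3.2:3b", "operator": "="}],
)

for s in series:
    for point in s.values:
        # `unit` is the field added to MetricDataPoint in this patch.
        print(point.timestamp, point.value, point.unit)
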
diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py index 92422ac1b..8d1b5d697 100644 --- a/llama_stack/apis/telemetry/telemetry.py +++ b/llama_stack/apis/telemetry/telemetry.py @@ -386,6 +386,7 @@ class MetricDataPoint(BaseModel): timestamp: int value: float + unit: str @json_schema_type @@ -518,7 +519,7 @@ class Telemetry(Protocol): metric_name: str, start_time: int, end_time: int | None = None, - granularity: str | None = "1d", + granularity: str | None = None, query_type: MetricQueryType = MetricQueryType.RANGE, label_matchers: list[MetricLabelMatcher] | None = None, ) -> QueryMetricsResponse: diff --git a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py index d99255c79..cdd2b9f9a 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py @@ -4,6 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import datetime import logging import threading from typing import Any @@ -145,11 +146,41 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry): metric_name: str, start_time: int, end_time: int | None = None, - granularity: str | None = "1d", + granularity: str | None = None, query_type: MetricQueryType = MetricQueryType.RANGE, label_matchers: list[MetricLabelMatcher] | None = None, ) -> QueryMetricsResponse: - raise NotImplementedError("Querying metrics is not implemented") + """Query metrics from the telemetry store. + + Args: + metric_name: The name of the metric to query (e.g., "prompt_tokens") + start_time: Start time as Unix timestamp + end_time: End time as Unix timestamp (defaults to now if None) + granularity: Time granularity for aggregation + query_type: Type of query (RANGE or INSTANT) + label_matchers: Label filters to apply + + Returns: + QueryMetricsResponse with metric time series data + """ + # Convert timestamps to datetime objects + start_dt = datetime.datetime.fromtimestamp(start_time, datetime.UTC) + end_dt = datetime.datetime.fromtimestamp(end_time, datetime.UTC) if end_time else None + + # Use SQLite trace store if available + if hasattr(self, "trace_store") and self.trace_store: + return await self.trace_store.query_metrics( + metric_name=metric_name, + start_time=start_dt, + end_time=end_dt, + granularity=granularity, + query_type=query_type, + label_matchers=label_matchers, + ) + else: + raise ValueError( + f"In order to query_metrics, you must have {TelemetrySink.SQLITE} set in your telemetry sinks" + ) def _log_unstructured(self, event: UnstructuredLogEvent, ttl_seconds: int) -> None: with self._lock: diff --git a/llama_stack/providers/utils/telemetry/sqlite_trace_store.py b/llama_stack/providers/utils/telemetry/sqlite_trace_store.py index 8dd6061a6..71480364c 100644 --- a/llama_stack/providers/utils/telemetry/sqlite_trace_store.py +++ b/llama_stack/providers/utils/telemetry/sqlite_trace_store.py @@ -5,12 +5,23 @@ # the root directory of this source tree. 
import json -from datetime import datetime +from datetime import UTC, datetime from typing import Protocol import aiosqlite -from llama_stack.apis.telemetry import QueryCondition, Span, SpanWithStatus, Trace +from llama_stack.apis.telemetry import ( + MetricDataPoint, + MetricLabel, + MetricLabelMatcher, + MetricQueryType, + MetricSeries, + QueryCondition, + QueryMetricsResponse, + Span, + SpanWithStatus, + Trace, +) class TraceStore(Protocol): @@ -29,11 +40,192 @@ class TraceStore(Protocol): max_depth: int | None = None, ) -> dict[str, SpanWithStatus]: ... + async def query_metrics( + self, + metric_name: str, + start_time: datetime, + end_time: datetime | None = None, + granularity: str | None = "1d", + query_type: MetricQueryType = MetricQueryType.RANGE, + label_matchers: list[MetricLabelMatcher] | None = None, + ) -> QueryMetricsResponse: ... + class SQLiteTraceStore(TraceStore): def __init__(self, conn_string: str): self.conn_string = conn_string + async def query_metrics( + self, + metric_name: str, + start_time: datetime, + end_time: datetime | None = None, + granularity: str | None = None, + query_type: MetricQueryType = MetricQueryType.RANGE, + label_matchers: list[MetricLabelMatcher] | None = None, + ) -> QueryMetricsResponse: + if end_time is None: + end_time = datetime.now(UTC) + + # Build base query + if query_type == MetricQueryType.INSTANT: + query = """ + SELECT + se.name, + SUM(CAST(json_extract(se.attributes, '$.value') AS REAL)) as value, + json_extract(se.attributes, '$.unit') as unit, + se.attributes + FROM span_events se + WHERE se.name = ? + AND se.timestamp BETWEEN ? AND ? + """ + else: + if granularity: + time_format = self._get_time_format_for_granularity(granularity) + query = f""" + SELECT + se.name, + SUM(CAST(json_extract(se.attributes, '$.value') AS REAL)) as value, + json_extract(se.attributes, '$.unit') as unit, + se.attributes, + strftime('{time_format}', se.timestamp) as bucket_start + FROM span_events se + WHERE se.name = ? + AND se.timestamp BETWEEN ? AND ? + """ + else: + query = """ + SELECT + se.name, + json_extract(se.attributes, '$.value') as value, + json_extract(se.attributes, '$.unit') as unit, + se.attributes, + se.timestamp + FROM span_events se + WHERE se.name = ? + AND se.timestamp BETWEEN ? AND ? + """ + + params = [f"metric.{metric_name}", start_time.isoformat(), end_time.isoformat()] + + # Labels that will be attached to the MetricSeries (preserve matcher labels) + all_labels: list[MetricLabel] = [] + matcher_label_names = set() + if label_matchers: + for matcher in label_matchers: + json_path = f"$.{matcher.name}" + if matcher.operator == "=": + query += f" AND json_extract(se.attributes, '{json_path}') = ?" + params.append(matcher.value) + elif matcher.operator == "!=": + query += f" AND json_extract(se.attributes, '{json_path}') != ?" + params.append(matcher.value) + elif matcher.operator == "=~": + query += f" AND json_extract(se.attributes, '{json_path}') LIKE ?" + params.append(f"%{matcher.value}%") + elif matcher.operator == "!~": + query += f" AND json_extract(se.attributes, '{json_path}') NOT LIKE ?" 
+ params.append(f"%{matcher.value}%") + # Preserve filter context in output + all_labels.append(MetricLabel(name=matcher.name, value=str(matcher.value))) + matcher_label_names.add(matcher.name) + + # GROUP BY / ORDER BY logic + if query_type == MetricQueryType.RANGE and granularity: + group_time_format = self._get_time_format_for_granularity(granularity) + query += f" GROUP BY strftime('{group_time_format}', se.timestamp), json_extract(se.attributes, '$.unit')" + query += " ORDER BY bucket_start" + elif query_type == MetricQueryType.INSTANT: + query += " GROUP BY json_extract(se.attributes, '$.unit')" + else: + query += " ORDER BY se.timestamp" + + # Execute query + async with aiosqlite.connect(self.conn_string) as conn: + conn.row_factory = aiosqlite.Row + async with conn.execute(query, params) as cursor: + rows = await cursor.fetchall() + + if not rows: + return QueryMetricsResponse(data=[]) + + data_points = [] + # We want to add attribute labels, but only those not already present as matcher labels. + attr_label_names = set() + for row in rows: + # Parse JSON attributes safely, if there are no attributes (weird), just don't add the labels to the result. + try: + attributes = json.loads(row["attributes"] or "{}") + except (TypeError, json.JSONDecodeError): + attributes = {} + + value = row["value"] + unit = row["unit"] or "" + + # Add labels from attributes without duplicating matcher labels, if we don't do this, there will be a lot of duplicate label in the result. + for k, v in attributes.items(): + if k not in ["value", "unit"] and k not in matcher_label_names and k not in attr_label_names: + all_labels.append(MetricLabel(name=k, value=str(v))) + attr_label_names.add(k) + + # Determine timestamp + if query_type == MetricQueryType.RANGE and granularity: + try: + bucket_start_raw = row["bucket_start"] + except KeyError as e: + raise ValueError( + "DB did not have a bucket_start time in row when using granularity, this indicates improper formatting" + ) from e + # this value could also be there, but be NULL, I think. + if bucket_start_raw is None: + raise ValueError("bucket_start is None check time format and data") + bucket_start = datetime.fromisoformat(bucket_start_raw) + timestamp = int(bucket_start.timestamp()) + elif query_type == MetricQueryType.INSTANT: + timestamp = int(datetime.now(UTC).timestamp()) + else: + try: + timestamp_raw = row["timestamp"] + except KeyError as e: + raise ValueError( + "DB did not have a timestamp in row, this indicates improper formatting" + ) from e + # this value could also be there, but be NULL, I think. + if timestamp_raw is None: + raise ValueError("timestamp is None check time format and data") + timestamp_iso = datetime.fromisoformat(timestamp_raw) + timestamp = int(timestamp_iso.timestamp()) + + data_points.append( + MetricDataPoint( + timestamp=timestamp, + value=value, + unit=unit, + ) + ) + + metric_series = [MetricSeries(metric=metric_name, labels=all_labels, values=data_points)] + return QueryMetricsResponse(data=metric_series) + + def _get_time_format_for_granularity(self, granularity: str | None) -> str: + """Get the SQLite strftime format string for a given granularity. 
+ Args: + granularity: Granularity string (e.g., "1m", "5m", "1h", "1d") + Returns: + SQLite strftime format string for the granularity + """ + if granularity is None: + raise ValueError("granularity cannot be None for this method - use separate logic for no aggregation") + + if granularity.endswith("d"): + return "%Y-%m-%d 00:00:00" + elif granularity.endswith("h"): + return "%Y-%m-%d %H:00:00" + elif granularity.endswith("m"): + return "%Y-%m-%d %H:%M:00" + else: + return "%Y-%m-%d %H:%M:00" # Default to most granular which will give us the most timestamps. + async def query_traces( self, attribute_filters: list[QueryCondition] | None = None, From 130bf7db83efe695753bba6867dc3ecba1b72413 Mon Sep 17 00:00:00 2001 From: Charlie Doern Date: Fri, 8 Aug 2025 14:33:29 -0400 Subject: [PATCH 2/2] test: add integration tests for query_metrics the integration tests check if all of the metrics we currently support actual are queryable after inference requests this also tests thinks like aggregation, label filtering, etc Signed-off-by: Charlie Doern --- tests/integration/recordings/index.sqlite | Bin 53248 -> 57344 bytes .../recordings/responses/390f0c7dac96.json | 39 ++++ .../recordings/responses/4de6877d86fa.json | 56 +++++ .../recordings/responses/5db0c44c83a4.json | 56 +++++ .../recordings/responses/7bcb0f86c91b.json | 39 ++++ .../recordings/responses/c31a86ea6c58.json | 39 ++++ .../recordings/responses/d0ac68cbde69.json | 24 +- .../recordings/responses/f6857bcea729.json | 39 ++++ .../recordings/responses/f80b99430f7e.json | 39 ++++ .../telemetry/test_telemetry_metrics.py | 208 ++++++++++++++++++ 10 files changed, 537 insertions(+), 2 deletions(-) create mode 100644 tests/integration/recordings/responses/390f0c7dac96.json create mode 100644 tests/integration/recordings/responses/4de6877d86fa.json create mode 100644 tests/integration/recordings/responses/5db0c44c83a4.json create mode 100644 tests/integration/recordings/responses/7bcb0f86c91b.json create mode 100644 tests/integration/recordings/responses/c31a86ea6c58.json create mode 100644 tests/integration/recordings/responses/f6857bcea729.json create mode 100644 tests/integration/recordings/responses/f80b99430f7e.json create mode 100644 tests/integration/telemetry/test_telemetry_metrics.py diff --git a/tests/integration/recordings/index.sqlite b/tests/integration/recordings/index.sqlite index 6f73bb3a00053c8d4182c9412d59346c831d6f04..089f55273015360a90711a04778c28d89dff7a98 100644 GIT binary patch delta 2528 zcmcJRTZmOv7{~YC=YHKYqfB0EJf>KY+PYs(qcen!Nfa0qK|$`l*51YhHODfFEImXt z&J;9m=*e)vNC}Y{c3C=RzC;xG5U6y8keGb(A&kJM);V-$_LMKq5FF0K;{4Zdec$*0 zuXPSgrVgA>y|+4Z&a$lPL~X!6H@)jaw>RfrHVanOz2;tbuevwfEAG$k<#<)sS+{J~ z%VT}!yENJxUsr>;|CC;qO_z3*oZ^g~ zjrFcSQ`ygl(#?t!4Gs-PNAyGXr}0&NyS=@=NOZsi=UTAP6F_kmhQ1^o)Pe97kbW3Q z2s{=*6i_G;@Cf&z5-1Djc4r0eLnL5mLN16X-`SUA!X7-LJOpU zmK+#XpkxpT0GKbd$00G>hmiY<0zrTWMCRRiW_U*?zu9`lvc4>IN5zpDl~KoOz@_K1fm!;plO3c$D~1O-xsT4Tnk@VO5NWJD?-QptoD zYD`0&`J&wggZDuwn3B*Bu%c87&jSdU(;@*e1`33ddrEW6rD+hv9yjL?b`mNP<^*Tn zDLK}5XUWGo5YN84dPCOCx;OPgl$*q9r{Df4-Z}|v$4N~zSbXu^M!VcxIl3a>tlUQj za3T7&F&Ot9oh{dwr?*+z!`WiK0n>$>+SlwDVUg;_CE6tW3EuJdY3S;@}`5<>W zCsTJjlg znMr*#_4xYd5I}Sj5av^ZybeJf@MdXje}}hJlP2{o-td2Uw3!}zqs6<+)E6>AjU@t) zBgwGOIAMy0fd|Z}snTZR2&e#PqKRl(ObTX#5pc~R79L{>eYAFa-%ZNH2KV|LP5@s@ zf;~1PeS)5vq*e$9Nb^>GiZM*FMBMWO69Ye?kJf&?BAR+T zd)vZ}8YID%l{y&h>)L&f*fj4W+*~=fu07WQoL4^9Au^YZLdfHix6K_7PTs`fQQy(W z;^Uv3vD3|-(Fdc$J#R%vPM71~KRcL;XJ@WCy+xCmJF|x@_q4mmy|?;X^@HjYxmvXR 
z{KjJTQ2yKO5BWFq{nL%1Z2R6wOlVOP&djv$jb!IDVRdw*^U`=K;f$9o|xXmC$#RzO0k>01L&t-zY)c*(xeboIF>9`lxp)in3IhWJ??eTAMwN{TOZdgu`0oG(7QcVD5n|7gjbMh6)CT{gZhLq$X<=*syTUXPmM*rQilT3pcXT*Og*wY>YV! zk_--$*%uhIFgP$cY_?lqz{tdywy`jYaq^sP^EMYOp2A}$$vBOHKa1}QpB?WCULl?u z?q^)LxLi2baEfr$vA%GA8HOLp?A>skCrLXAQlu9N2owN8#- z$i2B^XBRVbahLGqmv=L`5sJ(<3+;W!w0YS=9Yzr@U?9u0r!(;H<|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 1<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + "options": { + "temperature": 0.0 + }, + "stream": false + }, + "endpoint": "/api/generate", + "model": "llama3.2:3b-instruct-fp16" + }, + "response": { + "body": { + "__type__": "ollama._types.GenerateResponse", + "__data__": { + "model": "llama3.2:3b-instruct-fp16", + "created_at": "2025-08-11T15:51:18.170868Z", + "done": true, + "done_reason": "stop", + "total_duration": 5240614083, + "load_duration": 9823416, + "prompt_eval_count": 21, + "prompt_eval_duration": 21000000, + "eval_count": 310, + "eval_duration": 5209000000, + "response": "This is the start of a test. I'll provide some sample data and you can try to generate metrics based on it.\n\n**Data:**\n\nLet's say we have a dataset of user interactions with an e-commerce website. The data includes:\n\n| User ID | Product Name | Purchase Date | Quantity | Price |\n| --- | --- | --- | --- | --- |\n| 1 | iPhone 13 | 2022-01-01 | 2 | 999.99 |\n| 1 | MacBook Air | 2022-01-05 | 1 | 1299.99 |\n| 2 | Samsung TV | 2022-01-10 | 3 | 899.99 |\n| 3 | iPhone 13 | 2022-01-15 | 1 | 999.99 |\n| 4 | MacBook Pro | 2022-01-20 | 2 | 1799.99 |\n\n**Task:**\n\nYour task is to generate the following metrics based on this data:\n\n1. Average order value (AOV)\n2. Conversion rate\n3. Average revenue per user (ARPU)\n4. Customer lifetime value (CLV)\n\nPlease provide your answers in a format like this:\n\n| Metric | Value |\n| --- | --- |\n| AOV | 1234.56 |\n| Conversion Rate | 0.25 |\n| ARPU | 1000.00 |\n| CLV | 5000.00 |\n\nGo ahead and generate the metrics!", + "thinking": null, + "context": null + } + }, + "is_streaming": false + } +} diff --git a/tests/integration/recordings/responses/4de6877d86fa.json b/tests/integration/recordings/responses/4de6877d86fa.json new file mode 100644 index 000000000..c33176b04 --- /dev/null +++ b/tests/integration/recordings/responses/4de6877d86fa.json @@ -0,0 +1,56 @@ +{ + "request": { + "method": "POST", + "url": "http://localhost:11434/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "llama3.2:3b", + "messages": [ + { + "role": "user", + "content": "OpenAI test 0" + } + ], + "stream": false + }, + "endpoint": "/v1/chat/completions", + "model": "llama3.2:3b" + }, + "response": { + "body": { + "__type__": "openai.types.chat.chat_completion.ChatCompletion", + "__data__": { + "id": "chatcmpl-159", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "It seems like you're trying to access an internal system, but I don't have any specific information about OpenAI's internal testing environment or how to interact with it. However, I can tell you that OpenAI is a company that provides various AI-related products and services, including the popular language model called Llama.\n\nIf you're looking for information on how to use OpenAI's products or APIs, I'd be happy to help with that! 
Alternatively, if you have any specific questions about AI or machine learning, feel free to ask, and I'll do my best to provide a helpful response.", + "refusal": null, + "role": "assistant", + "annotations": null, + "audio": null, + "function_call": null, + "tool_calls": null + } + } + ], + "created": 1754927774, + "model": "llama3.2:3b", + "object": "chat.completion", + "service_tier": null, + "system_fingerprint": "fp_ollama", + "usage": { + "completion_tokens": 122, + "prompt_tokens": 30, + "total_tokens": 152, + "completion_tokens_details": null, + "prompt_tokens_details": null + } + } + }, + "is_streaming": false + } +} diff --git a/tests/integration/recordings/responses/5db0c44c83a4.json b/tests/integration/recordings/responses/5db0c44c83a4.json new file mode 100644 index 000000000..c1a66e56a --- /dev/null +++ b/tests/integration/recordings/responses/5db0c44c83a4.json @@ -0,0 +1,56 @@ +{ + "request": { + "method": "POST", + "url": "http://localhost:11434/v1/v1/chat/completions", + "headers": {}, + "body": { + "model": "llama3.2:3b", + "messages": [ + { + "role": "user", + "content": "OpenAI test 1" + } + ], + "stream": false + }, + "endpoint": "/v1/chat/completions", + "model": "llama3.2:3b" + }, + "response": { + "body": { + "__type__": "openai.types.chat.chat_completion.ChatCompletion", + "__data__": { + "id": "chatcmpl-831", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "I'm ready to help. What would you like to do? Would you like to:\n\n1. Ask a question\n2. Engage in a conversation\n3. Test my language understanding\n4. Something else (please specify)", + "refusal": null, + "role": "assistant", + "annotations": null, + "audio": null, + "function_call": null, + "tool_calls": null + } + } + ], + "created": 1754927774, + "model": "llama3.2:3b", + "object": "chat.completion", + "service_tier": null, + "system_fingerprint": "fp_ollama", + "usage": { + "completion_tokens": 48, + "prompt_tokens": 30, + "total_tokens": 78, + "completion_tokens_details": null, + "prompt_tokens_details": null + } + } + }, + "is_streaming": false + } +} diff --git a/tests/integration/recordings/responses/7bcb0f86c91b.json b/tests/integration/recordings/responses/7bcb0f86c91b.json new file mode 100644 index 000000000..4c9a55153 --- /dev/null +++ b/tests/integration/recordings/responses/7bcb0f86c91b.json @@ -0,0 +1,39 @@ +{ + "request": { + "method": "POST", + "url": "http://localhost:11434/api/generate", + "headers": {}, + "body": { + "model": "llama3.2:3b-instruct-fp16", + "raw": true, + "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 0<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + "options": { + "temperature": 0.0 + }, + "stream": false + }, + "endpoint": "/api/generate", + "model": "llama3.2:3b-instruct-fp16" + }, + "response": { + "body": { + "__type__": "ollama._types.GenerateResponse", + "__data__": { + "model": "llama3.2:3b-instruct-fp16", + "created_at": "2025-08-11T15:51:12.918723Z", + "done": true, + "done_reason": "stop", + "total_duration": 8868987792, + "load_duration": 2793275292, + "prompt_eval_count": 21, + "prompt_eval_duration": 250000000, + "eval_count": 344, + "eval_duration": 5823000000, + "response": "Here are some common test metrics used to evaluate the performance of a system:\n\n1. **Accuracy**: The proportion of correct predictions or classifications out of total predictions made.\n2. 
**Precision**: The ratio of true positives (correctly predicted instances) to the sum of true positives and false positives (incorrectly predicted instances).\n3. **Recall**: The ratio of true positives to the sum of true positives and false negatives (missed instances).\n4. **F1-score**: The harmonic mean of precision and recall, providing a balanced measure of both.\n5. **Mean Squared Error (MSE)**: The average squared difference between predicted and actual values.\n6. **Mean Absolute Error (MAE)**: The average absolute difference between predicted and actual values.\n7. **Root Mean Squared Percentage Error (RMSPE)**: The square root of the mean of the squared percentage differences between predicted and actual values.\n8. **Coefficient of Determination (R-squared, R2)**: Measures how well a model fits the data, with higher values indicating better fit.\n9. **Mean Absolute Percentage Error (MAPE)**: The average absolute percentage difference between predicted and actual values.\n10. **Normalized Mean Squared Error (NMSE)**: Similar to MSE, but normalized by the mean of the actual values.\n\nThese metrics can be used for various types of data, including:\n\n* Regression problems (e.g., predicting continuous values)\n* Classification problems (e.g., predicting categorical labels)\n* Time series forecasting\n* Clustering and dimensionality reduction\n\nWhen choosing a metric, consider the specific problem you're trying to solve, the type of data, and the desired level of precision.", + "thinking": null, + "context": null + } + }, + "is_streaming": false + } +} diff --git a/tests/integration/recordings/responses/c31a86ea6c58.json b/tests/integration/recordings/responses/c31a86ea6c58.json new file mode 100644 index 000000000..b8d109ddd --- /dev/null +++ b/tests/integration/recordings/responses/c31a86ea6c58.json @@ -0,0 +1,39 @@ +{ + "request": { + "method": "POST", + "url": "http://localhost:11434/api/generate", + "headers": {}, + "body": { + "model": "llama3.2:3b", + "raw": true, + "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 0<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + "options": { + "temperature": 0.0 + }, + "stream": false + }, + "endpoint": "/api/generate", + "model": "llama3.2:3b" + }, + "response": { + "body": { + "__type__": "ollama._types.GenerateResponse", + "__data__": { + "model": "llama3.2:3b", + "created_at": "2025-08-11T15:56:06.703788Z", + "done": true, + "done_reason": "stop", + "total_duration": 2722294000, + "load_duration": 9736083, + "prompt_eval_count": 21, + "prompt_eval_duration": 113000000, + "eval_count": 324, + "eval_duration": 2598000000, + "response": "Here are some test metrics that can be used to evaluate the performance of a system:\n\n1. **Accuracy**: The proportion of correct predictions made by the model.\n2. **Precision**: The ratio of true positives (correctly predicted instances) to total positive predictions.\n3. **Recall**: The ratio of true positives to the sum of true positives and false negatives (missed instances).\n4. **F1-score**: The harmonic mean of precision and recall, providing a balanced measure of both.\n5. **Mean Squared Error (MSE)**: The average squared difference between predicted and actual values.\n6. **Mean Absolute Error (MAE)**: The average absolute difference between predicted and actual values.\n7. 
**Root Mean Squared Percentage Error (RMSPE)**: A variation of MSE that expresses the error as a percentage.\n8. **Coefficient of Determination (R-squared, R2)**: Measures how well the model explains the variance in the data.\n9. **Mean Absolute Percentage Error (MAPE)**: The average absolute percentage difference between predicted and actual values.\n10. **Mean Squared Logarithmic Error (MSLE)**: A variation of MSE that is more suitable for skewed distributions.\n\nThese metrics can be used to evaluate different aspects of a system's performance, such as:\n\n* Classification models: accuracy, precision, recall, F1-score\n* Regression models: MSE, MAE, RMSPE, R2, MSLE\n* Time series forecasting: MAPE, RMSPE\n\nNote that the choice of metric depends on the specific problem and data.", + "thinking": null, + "context": null + } + }, + "is_streaming": false + } +} diff --git a/tests/integration/recordings/responses/d0ac68cbde69.json b/tests/integration/recordings/responses/d0ac68cbde69.json index e9a939aa0..f99c7ca38 100644 --- a/tests/integration/recordings/responses/d0ac68cbde69.json +++ b/tests/integration/recordings/responses/d0ac68cbde69.json @@ -16,7 +16,7 @@ "model": "llama3.2:3b", "name": "llama3.2:3b", "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72", - "expires_at": "2025-08-06T15:57:21.573326-04:00", + "expires_at": "2025-08-11T11:58:17.058489-04:00", "size": 4030033920, "size_vram": 4030033920, "details": { @@ -28,7 +28,27 @@ ], "parameter_size": "3.2B", "quantization_level": "Q4_K_M" - } + }, + "context_length": null + }, + { + "model": "llama3.2:3b-instruct-fp16", + "name": "llama3.2:3b-instruct-fp16", + "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d", + "expires_at": "2025-08-11T11:56:22.727361-04:00", + "size": 8581748736, + "size_vram": 8581748736, + "details": { + "parent_model": "", + "format": "gguf", + "family": "llama", + "families": [ + "llama" + ], + "parameter_size": "3.2B", + "quantization_level": "F16" + }, + "context_length": null } ] } diff --git a/tests/integration/recordings/responses/f6857bcea729.json b/tests/integration/recordings/responses/f6857bcea729.json new file mode 100644 index 000000000..404bfb987 --- /dev/null +++ b/tests/integration/recordings/responses/f6857bcea729.json @@ -0,0 +1,39 @@ +{ + "request": { + "method": "POST", + "url": "http://localhost:11434/api/generate", + "headers": {}, + "body": { + "model": "llama3.2:3b", + "raw": true, + "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 2<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + "options": { + "temperature": 0.0 + }, + "stream": false + }, + "endpoint": "/api/generate", + "model": "llama3.2:3b" + }, + "response": { + "body": { + "__type__": "ollama._types.GenerateResponse", + "__data__": { + "model": "llama3.2:3b", + "created_at": "2025-08-11T15:56:13.082679Z", + "done": true, + "done_reason": "stop", + "total_duration": 2606245291, + "load_duration": 9979708, + "prompt_eval_count": 21, + "prompt_eval_duration": 23000000, + "eval_count": 321, + "eval_duration": 2572000000, + "response": "Here are some test metrics that can be used to evaluate the performance of a system:\n\n1. **Accuracy**: Measures how close the predicted values are to the actual values.\n2. **Precision**: Measures the proportion of true positives among all positive predictions made by the model.\n3. 
**Recall**: Measures the proportion of true positives among all actual positive instances.\n4. **F1-score**: The harmonic mean of precision and recall, providing a balanced measure of both.\n5. **Mean Squared Error (MSE)**: Measures the average squared difference between predicted and actual values.\n6. **Mean Absolute Error (MAE)**: Measures the average absolute difference between predicted and actual values.\n7. **Root Mean Squared Percentage Error (RMSPE)**: A variation of MSE that expresses errors as a percentage of the actual value.\n8. **Coefficient of Determination (R-squared, R2)**: Measures how well the model explains the variance in the data.\n9. **Mean Absolute Percentage Error (MAPE)**: Measures the average absolute percentage difference between predicted and actual values.\n10. **Mean Squared Logarithmic Error (MSLE)**: A variation of MSE that is more suitable for skewed distributions.\n\nThese metrics can be used to evaluate different aspects of a system's performance, such as:\n\n* Classification models: accuracy, precision, recall, F1-score\n* Regression models: MSE, MAE, RMSPE, R2\n* Time series forecasting: MAPE, MSLE\n\nNote that the choice of metric depends on the specific problem and data.", + "thinking": null, + "context": null + } + }, + "is_streaming": false + } +} diff --git a/tests/integration/recordings/responses/f80b99430f7e.json b/tests/integration/recordings/responses/f80b99430f7e.json new file mode 100644 index 000000000..5b692f4ca --- /dev/null +++ b/tests/integration/recordings/responses/f80b99430f7e.json @@ -0,0 +1,39 @@ +{ + "request": { + "method": "POST", + "url": "http://localhost:11434/api/generate", + "headers": {}, + "body": { + "model": "llama3.2:3b", + "raw": true, + "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 1<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + "options": { + "temperature": 0.0 + }, + "stream": false + }, + "endpoint": "/api/generate", + "model": "llama3.2:3b" + }, + "response": { + "body": { + "__type__": "ollama._types.GenerateResponse", + "__data__": { + "model": "llama3.2:3b", + "created_at": "2025-08-11T15:56:10.465932Z", + "done": true, + "done_reason": "stop", + "total_duration": 3745686709, + "load_duration": 9734584, + "prompt_eval_count": 21, + "prompt_eval_duration": 23000000, + "eval_count": 457, + "eval_duration": 3712000000, + "response": "Here are some test metrics that can be used to evaluate the performance of a system:\n\n**Primary Metrics**\n\n1. **Response Time**: The time it takes for the system to respond to a request.\n2. **Throughput**: The number of requests processed by the system per unit time (e.g., requests per second).\n3. **Error Rate**: The percentage of requests that result in an error.\n\n**Secondary Metrics**\n\n1. **Average Response Time**: The average response time for all requests.\n2. **Median Response Time**: The middle value of the response times, used to detect outliers.\n3. **99th Percentile Response Time**: The response time at which 99% of requests are completed within this time.\n4. **Request Latency**: The difference between the request arrival time and the response time.\n\n**User Experience Metrics**\n\n1. **User Satisfaction (USAT)**: Measured through surveys or feedback forms to gauge user satisfaction with the system's performance.\n2. **First Response Time**: The time it takes for a user to receive their first response from the system.\n3. 
**Time Spent in System**: The total amount of time a user spends interacting with the system.\n\n**System Resource Metrics**\n\n1. **CPU Utilization**: The percentage of CPU resources being used by the system.\n2. **Memory Usage**: The amount of memory being used by the system.\n3. **Disk I/O Wait Time**: The average time spent waiting for disk I/O operations to complete.\n\n**Security Metrics**\n\n1. **Authentication Success Rate**: The percentage of successful authentication attempts.\n2. **Authorization Success Rate**: The percentage of successful authorization attempts.\n3. **Error Rate (Security)**: The percentage of security-related errors.\n\n**Other Metrics**\n\n1. **Page Load Time**: The time it takes for a page to load.\n2. **Click-Through Rate (CTR)**: The percentage of users who click on a link or button after seeing an ad or notification.\n3. **Conversion Rate**: The percentage of users who complete a desired action (e.g., fill out a form, make a purchase).\n\nThese metrics can be used to evaluate the performance and effectiveness of various aspects of your system, from user experience to security and resource utilization.", + "thinking": null, + "context": null + } + }, + "is_streaming": false + } +} diff --git a/tests/integration/telemetry/test_telemetry_metrics.py b/tests/integration/telemetry/test_telemetry_metrics.py new file mode 100644 index 000000000..5e3b42ea2 --- /dev/null +++ b/tests/integration/telemetry/test_telemetry_metrics.py @@ -0,0 +1,208 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import time +from datetime import UTC, datetime, timedelta + +import pytest + + +@pytest.fixture(scope="module", autouse=True) +def setup_telemetry_metrics_data(openai_client, client_with_models, text_model_id): + """Setup fixture that creates telemetry metrics data before tests run.""" + + # Skip OpenAI tests if running in library mode + if not hasattr(client_with_models, "base_url"): + pytest.skip("OpenAI client tests not supported with library client") + + prompt_tokens = [] + completion_tokens = [] + total_tokens = [] + for i in range(3): + response = client_with_models.inference.chat_completion( + model_id=text_model_id, + messages=[{"role": "user", "content": f"Test metrics generation {i}"}], + stream=False, # Ensure metrics are captured + ) + + # Create OpenAI completions to generate metrics using the proper OpenAI client + for i in range(2): + response = openai_client.chat.completions.create( + model=text_model_id, + messages=[{"role": "user", "content": f"OpenAI test {i}"}], + stream=False, + ) + prompt_tokens.append(response.usage.prompt_tokens) + completion_tokens.append(response.usage.completion_tokens) + total_tokens.append(response.usage.total_tokens) + + # Wait for metrics to be logged + start_time = time.time() + while time.time() - start_time < 30: + try: + # Try to query metrics to see if they're available + metrics_response = client_with_models.telemetry.query_metrics( + metric_name="completion_tokens", + start_time=int((datetime.now(UTC) - timedelta(minutes=5)).timestamp()), + ) + if len(metrics_response[0].values) > 0: + break + except Exception: + pass + time.sleep(1) + + # Wait additional time to ensure all metrics are processed + time.sleep(5) + + # Return the token lists for use in tests + return {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, "total_tokens": 
total_tokens} + + +def test_query_metrics_prompt_tokens(client_with_models, text_model_id, setup_telemetry_metrics_data): + """Test that prompt_tokens metrics are queryable.""" + start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp()) + + response = client_with_models.telemetry.query_metrics( + metric_name="prompt_tokens", + start_time=start_time, + ) + + assert isinstance(response, list) + + assert isinstance(response[0].values, list), "Should return a list of metric series" + + assert response[0].metric == "prompt_tokens" + + # Use the actual values from setup instead of hardcoded values + expected_values = setup_telemetry_metrics_data["prompt_tokens"] + assert response[0].values[-1].value in expected_values, ( + f"Expected one of {expected_values}, got {response[0].values[-1].value}" + ) + + +def test_query_metrics_completion_tokens(client_with_models, text_model_id, setup_telemetry_metrics_data): + """Test that completion_tokens metrics are queryable.""" + start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp()) + + response = client_with_models.telemetry.query_metrics( + metric_name="completion_tokens", + start_time=start_time, + ) + + assert isinstance(response, list) + + assert isinstance(response[0].values, list), "Should return a list of metric series" + + assert response[0].metric == "completion_tokens" + + # Use the actual values from setup instead of hardcoded values + expected_values = setup_telemetry_metrics_data["completion_tokens"] + assert response[0].values[-1].value in expected_values, ( + f"Expected one of {expected_values}, got {response[0].values[-1].value}" + ) + + +def test_query_metrics_total_tokens(client_with_models, text_model_id, setup_telemetry_metrics_data): + """Test that total_tokens metrics are queryable.""" + start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp()) + + response = client_with_models.telemetry.query_metrics( + metric_name="total_tokens", + start_time=start_time, + ) + + assert isinstance(response, list) + + assert isinstance(response[0].values, list), "Should return a list of metric series" + + assert response[0].metric == "total_tokens" + + # Use the actual values from setup instead of hardcoded values + expected_values = setup_telemetry_metrics_data["total_tokens"] + assert response[0].values[-1].value in expected_values, ( + f"Expected one of {expected_values}, got {response[0].values[-1].value}" + ) + + +def test_query_metrics_with_time_range(llama_stack_client, text_model_id): + """Test that metrics are queryable with time range.""" + end_time = int(datetime.now(UTC).timestamp()) + start_time = end_time - 600 # 10 minutes ago + + response = llama_stack_client.telemetry.query_metrics( + metric_name="prompt_tokens", + start_time=start_time, + end_time=end_time, + ) + + assert isinstance(response, list) + + assert isinstance(response[0].values, list), "Should return a list of metric series" + + assert response[0].metric == "prompt_tokens" + + +def test_query_metrics_with_label_matchers(llama_stack_client, text_model_id): + """Test that metrics are queryable with label matchers.""" + start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp()) + + response = llama_stack_client.telemetry.query_metrics( + metric_name="prompt_tokens", + start_time=start_time, + label_matchers=[{"name": "model_id", "value": text_model_id, "operator": "="}], + ) + + assert isinstance(response[0].values, list), "Should return a list of metric series" + + +def 
test_query_metrics_nonexistent_metric(llama_stack_client): + """Test that querying a nonexistent metric returns empty data.""" + start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp()) + + response = llama_stack_client.telemetry.query_metrics( + metric_name="nonexistent_metric", + start_time=start_time, + ) + + assert isinstance(response, list), "Should return an empty list for nonexistent metric" + assert len(response) == 0 + + +def test_query_metrics_with_granularity(llama_stack_client, text_model_id): + """Test that metrics are queryable with different granularity levels.""" + start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp()) + + # Test hourly granularity + hourly_response = llama_stack_client.telemetry.query_metrics( + metric_name="total_tokens", + start_time=start_time, + granularity="1h", + ) + + # Test daily granularity + daily_response = llama_stack_client.telemetry.query_metrics( + metric_name="total_tokens", + start_time=start_time, + granularity="1d", + ) + + # Test no granularity (raw data points) + raw_response = llama_stack_client.telemetry.query_metrics( + metric_name="total_tokens", + start_time=start_time, + granularity=None, + ) + + # All should return valid data + assert isinstance(hourly_response[0].values, list), "Hourly granularity should return data" + assert isinstance(daily_response[0].values, list), "Daily granularity should return data" + assert isinstance(raw_response[0].values, list), "No granularity should return data" + + # Verify that different granularities produce different aggregation levels + # (The exact number depends on data distribution, but they should be queryable) + assert len(hourly_response[0].values) >= 0, "Hourly granularity should be queryable" + assert len(daily_response[0].values) >= 0, "Daily granularity should be queryable" + assert len(raw_response[0].values) >= 0, "No granularity should be queryable"
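
To make the granularity behavior exercised above concrete, here is a minimal, self-contained sketch of how the strftime formats returned by `_get_time_format_for_granularity` bucket and sum metric rows in SQLite. The in-memory `span_events` table and its sample rows are simplified stand-ins invented for illustration; they are not the provider's full schema or data from the recordings above.

# Minimal sketch (assumed sample data): how "1h" granularity buckets metric
# rows, mirroring the SUM(...) / GROUP BY strftime(...) query built by
# SQLiteTraceStore.query_metrics. Requires an SQLite build with the JSON1
# functions, which is the default in recent Python releases.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE span_events (name TEXT, timestamp TEXT, attributes TEXT)")
conn.executemany(
    "INSERT INTO span_events VALUES (?, ?, ?)",
    [
        ("metric.total_tokens", "2025-08-11 15:10:00", '{"value": 120, "unit": "tokens"}'),
        ("metric.total_tokens", "2025-08-11 15:40:00", '{"value": 80, "unit": "tokens"}'),
        ("metric.total_tokens", "2025-08-11 16:05:00", '{"value": 50, "unit": "tokens"}'),
    ],
)

# "1h" maps to '%Y-%m-%d %H:00:00', so the first two rows land in the same bucket.
for bucket_start, value in conn.execute(
    """
    SELECT strftime('%Y-%m-%d %H:00:00', se.timestamp) AS bucket_start,
           SUM(CAST(json_extract(se.attributes, '$.value') AS REAL)) AS value
    FROM span_events se
    WHERE se.name = 'metric.total_tokens'
    GROUP BY bucket_start
    ORDER BY bucket_start
    """
):
    print(bucket_start, value)
# Expected output:
# 2025-08-11 15:00:00 200.0
# 2025-08-11 16:00:00 50.0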