From fe5f5e530c194d98cf5a40537ee6efa9121e60c1 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Wed, 7 May 2025 10:11:26 -0700 Subject: [PATCH] feat: add metrics query API (#1394) # What does this PR do? Adds the API to query metrics from telemetry. ## Test Plan llama stack run ~/.llama/distributions/fireworks/fireworks-run.yaml --------- Co-authored-by: Ashwin Bharambe --- docs/_static/llama-stack-spec.html | 189 ++++++++++++++++++ docs/_static/llama-stack-spec.yaml | 132 ++++++++++++ llama_stack/apis/telemetry/telemetry.py | 52 +++++ .../telemetry/meta_reference/telemetry.py | 14 ++ 4 files changed, 387 insertions(+) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 163d05087..4020dc4cd 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -3475,6 +3475,58 @@ } } }, + "/v1/telemetry/metrics/{metric_name}": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryMetricsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Telemetry" + ], + "description": "", + "parameters": [ + { + "name": "metric_name", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryMetricsRequest" + } + } + }, + "required": true + } + } + }, "/v1/telemetry/spans": { "post": { "responses": { @@ -11270,6 +11322,143 @@ ], "title": "QueryChunksResponse" }, + "QueryMetricsRequest": { + "type": "object", + "properties": { + "start_time": { + "type": "integer" + }, + "end_time": { + "type": "integer" + }, + "granularity": { + "type": "string" + }, + "query_type": { + "type": "string", + "enum": [ + "range", + "instant" + ], + "title": "MetricQueryType" + }, + "label_matchers": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "value": { + "type": "string" + }, + "operator": { + "type": "string", + "enum": [ + "=", + "!=", + "=~", + "!~" + ], + "title": "MetricLabelOperator", + "default": "=" + } + }, + "additionalProperties": false, + "required": [ + "name", + "value", + "operator" + ], + "title": "MetricLabelMatcher" + } + } + }, + "additionalProperties": false, + "required": [ + "start_time", + "query_type" + ], + "title": "QueryMetricsRequest" + }, + "MetricDataPoint": { + "type": "object", + "properties": { + "timestamp": { + "type": "integer" + }, + "value": { + "type": "number" + } + }, + "additionalProperties": false, + "required": [ + "timestamp", + "value" + ], + "title": "MetricDataPoint" + }, + "MetricLabel": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "value": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "name", + "value" + ], + "title": "MetricLabel" + }, + "MetricSeries": { + "type": "object", + "properties": { + "metric": { + "type": "string" + }, + "labels": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricLabel" + } + }, + "values": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricDataPoint" + } + } + }, + "additionalProperties": false, + "required": [ + "metric", + "labels", + "values" + ], + "title": "MetricSeries" + }, + "QueryMetricsResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricSeries" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ], + "title": "QueryMetricsResponse" + }, "QueryCondition": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 67c1d3dbd..62e3ca85c 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -2397,6 +2397,40 @@ paths: schema: $ref: '#/components/schemas/QueryChunksRequest' required: true + /v1/telemetry/metrics/{metric_name}: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/QueryMetricsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Telemetry + description: '' + parameters: + - name: metric_name + in: path + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/QueryMetricsRequest' + required: true /v1/telemetry/spans: post: responses: @@ -7762,6 +7796,104 @@ components: - chunks - scores title: QueryChunksResponse + QueryMetricsRequest: + type: object + properties: + start_time: + type: integer + end_time: + type: integer + granularity: + type: string + query_type: + type: string + enum: + - range + - instant + title: MetricQueryType + label_matchers: + type: array + items: + type: object + properties: + name: + type: string + value: + type: string + operator: + type: string + enum: + - '=' + - '!=' + - =~ + - '!~' + title: MetricLabelOperator + default: '=' + additionalProperties: false + required: + - name + - value + - operator + title: MetricLabelMatcher + additionalProperties: false + required: + - start_time + - query_type + title: QueryMetricsRequest + MetricDataPoint: + type: object + properties: + timestamp: + type: integer + value: + type: number + additionalProperties: false + required: + - timestamp + - value + title: MetricDataPoint + MetricLabel: + type: object + properties: + name: + type: string + value: + type: string + additionalProperties: false + required: + - name + - value + title: MetricLabel + MetricSeries: + type: object + properties: + metric: + type: string + labels: + type: array + items: + $ref: '#/components/schemas/MetricLabel' + values: + type: array + items: + $ref: '#/components/schemas/MetricDataPoint' + additionalProperties: false + required: + - metric + - labels + - values + title: MetricSeries + QueryMetricsResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/MetricSeries' + additionalProperties: false + required: + - data + title: QueryMetricsResponse QueryCondition: type: object properties: diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py index 34e296fef..0a3e63a88 100644 --- a/llama_stack/apis/telemetry/telemetry.py +++ b/llama_stack/apis/telemetry/telemetry.py @@ -203,6 +203,47 @@ class QuerySpanTreeResponse(BaseModel): data: dict[str, SpanWithStatus] +class MetricQueryType(Enum): + RANGE = "range" + INSTANT = "instant" + + +class MetricLabelOperator(Enum): + EQUALS = "=" + NOT_EQUALS = "!=" + REGEX_MATCH = "=~" + REGEX_NOT_MATCH = "!~" + + +class MetricLabelMatcher(BaseModel): + name: str + value: str + operator: MetricLabelOperator = MetricLabelOperator.EQUALS + + +@json_schema_type +class MetricLabel(BaseModel): + name: str + value: str + + +@json_schema_type +class MetricDataPoint(BaseModel): + timestamp: int + value: float + + +@json_schema_type +class MetricSeries(BaseModel): + metric: str + labels: list[MetricLabel] + values: list[MetricDataPoint] + + +class QueryMetricsResponse(BaseModel): + data: list[MetricSeries] + + @runtime_checkable class Telemetry(Protocol): @webmethod(route="/telemetry/events", method="POST") @@ -247,3 +288,14 @@ class Telemetry(Protocol): dataset_id: str, max_depth: int | None = None, ) -> None: ... + + @webmethod(route="/telemetry/metrics/{metric_name}", method="POST") + async def query_metrics( + self, + metric_name: str, + start_time: int, + end_time: int | None = None, + granularity: str | None = "1d", + query_type: MetricQueryType = MetricQueryType.RANGE, + label_matchers: list[MetricLabelMatcher] | None = None, + ) -> QueryMetricsResponse: ... diff --git a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py index 9295d5cab..67362dd36 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py @@ -20,7 +20,10 @@ from opentelemetry.semconv.resource import ResourceAttributes from llama_stack.apis.telemetry import ( Event, MetricEvent, + MetricLabelMatcher, + MetricQueryType, QueryCondition, + QueryMetricsResponse, QuerySpanTreeResponse, QueryTracesResponse, Span, @@ -123,6 +126,17 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry): else: raise ValueError(f"Unknown event type: {event}") + async def query_metrics( + self, + metric_name: str, + start_time: int, + end_time: int | None = None, + granularity: str | None = "1d", + query_type: MetricQueryType = MetricQueryType.RANGE, + label_matchers: list[MetricLabelMatcher] | None = None, + ) -> QueryMetricsResponse: + raise NotImplementedError("Querying metrics is not implemented") + def _log_unstructured(self, event: UnstructuredLogEvent, ttl_seconds: int) -> None: with self._lock: # Use global storage instead of instance storage