mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-15 06:00:48 +00:00
test: add integration tests for query_metrics
The integration tests check that all of the metrics we currently support are actually queryable after inference requests. This also tests things like aggregation, label filtering, etc. Signed-off-by: Charlie Doern <cdoern@redhat.com>
This commit is contained in:
parent
c1a9a21380
commit
130bf7db83
10 changed files with 537 additions and 2 deletions
Binary file not shown.
39
tests/integration/recordings/responses/390f0c7dac96.json
Normal file
39
tests/integration/recordings/responses/390f0c7dac96.json
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
{
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"url": "http://localhost:11434/api/generate",
|
||||||
|
"headers": {},
|
||||||
|
"body": {
|
||||||
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
|
"raw": true,
|
||||||
|
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 1<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
|
||||||
|
"options": {
|
||||||
|
"temperature": 0.0
|
||||||
|
},
|
||||||
|
"stream": false
|
||||||
|
},
|
||||||
|
"endpoint": "/api/generate",
|
||||||
|
"model": "llama3.2:3b-instruct-fp16"
|
||||||
|
},
|
||||||
|
"response": {
|
||||||
|
"body": {
|
||||||
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
|
"__data__": {
|
||||||
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
|
"created_at": "2025-08-11T15:51:18.170868Z",
|
||||||
|
"done": true,
|
||||||
|
"done_reason": "stop",
|
||||||
|
"total_duration": 5240614083,
|
||||||
|
"load_duration": 9823416,
|
||||||
|
"prompt_eval_count": 21,
|
||||||
|
"prompt_eval_duration": 21000000,
|
||||||
|
"eval_count": 310,
|
||||||
|
"eval_duration": 5209000000,
|
||||||
|
"response": "This is the start of a test. I'll provide some sample data and you can try to generate metrics based on it.\n\n**Data:**\n\nLet's say we have a dataset of user interactions with an e-commerce website. The data includes:\n\n| User ID | Product Name | Purchase Date | Quantity | Price |\n| --- | --- | --- | --- | --- |\n| 1 | iPhone 13 | 2022-01-01 | 2 | 999.99 |\n| 1 | MacBook Air | 2022-01-05 | 1 | 1299.99 |\n| 2 | Samsung TV | 2022-01-10 | 3 | 899.99 |\n| 3 | iPhone 13 | 2022-01-15 | 1 | 999.99 |\n| 4 | MacBook Pro | 2022-01-20 | 2 | 1799.99 |\n\n**Task:**\n\nYour task is to generate the following metrics based on this data:\n\n1. Average order value (AOV)\n2. Conversion rate\n3. Average revenue per user (ARPU)\n4. Customer lifetime value (CLV)\n\nPlease provide your answers in a format like this:\n\n| Metric | Value |\n| --- | --- |\n| AOV | 1234.56 |\n| Conversion Rate | 0.25 |\n| ARPU | 1000.00 |\n| CLV | 5000.00 |\n\nGo ahead and generate the metrics!",
|
||||||
|
"thinking": null,
|
||||||
|
"context": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"is_streaming": false
|
||||||
|
}
|
||||||
|
}
|
56
tests/integration/recordings/responses/4de6877d86fa.json
Normal file
56
tests/integration/recordings/responses/4de6877d86fa.json
Normal file
|
@ -0,0 +1,56 @@
|
||||||
|
{
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"url": "http://localhost:11434/v1/v1/chat/completions",
|
||||||
|
"headers": {},
|
||||||
|
"body": {
|
||||||
|
"model": "llama3.2:3b",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "OpenAI test 0"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"stream": false
|
||||||
|
},
|
||||||
|
"endpoint": "/v1/chat/completions",
|
||||||
|
"model": "llama3.2:3b"
|
||||||
|
},
|
||||||
|
"response": {
|
||||||
|
"body": {
|
||||||
|
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
|
||||||
|
"__data__": {
|
||||||
|
"id": "chatcmpl-159",
|
||||||
|
"choices": [
|
||||||
|
{
|
||||||
|
"finish_reason": "stop",
|
||||||
|
"index": 0,
|
||||||
|
"logprobs": null,
|
||||||
|
"message": {
|
||||||
|
"content": "It seems like you're trying to access an internal system, but I don't have any specific information about OpenAI's internal testing environment or how to interact with it. However, I can tell you that OpenAI is a company that provides various AI-related products and services, including the popular language model called Llama.\n\nIf you're looking for information on how to use OpenAI's products or APIs, I'd be happy to help with that! Alternatively, if you have any specific questions about AI or machine learning, feel free to ask, and I'll do my best to provide a helpful response.",
|
||||||
|
"refusal": null,
|
||||||
|
"role": "assistant",
|
||||||
|
"annotations": null,
|
||||||
|
"audio": null,
|
||||||
|
"function_call": null,
|
||||||
|
"tool_calls": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"created": 1754927774,
|
||||||
|
"model": "llama3.2:3b",
|
||||||
|
"object": "chat.completion",
|
||||||
|
"service_tier": null,
|
||||||
|
"system_fingerprint": "fp_ollama",
|
||||||
|
"usage": {
|
||||||
|
"completion_tokens": 122,
|
||||||
|
"prompt_tokens": 30,
|
||||||
|
"total_tokens": 152,
|
||||||
|
"completion_tokens_details": null,
|
||||||
|
"prompt_tokens_details": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"is_streaming": false
|
||||||
|
}
|
||||||
|
}
|
56
tests/integration/recordings/responses/5db0c44c83a4.json
Normal file
56
tests/integration/recordings/responses/5db0c44c83a4.json
Normal file
|
@ -0,0 +1,56 @@
|
||||||
|
{
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"url": "http://localhost:11434/v1/v1/chat/completions",
|
||||||
|
"headers": {},
|
||||||
|
"body": {
|
||||||
|
"model": "llama3.2:3b",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": "OpenAI test 1"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"stream": false
|
||||||
|
},
|
||||||
|
"endpoint": "/v1/chat/completions",
|
||||||
|
"model": "llama3.2:3b"
|
||||||
|
},
|
||||||
|
"response": {
|
||||||
|
"body": {
|
||||||
|
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
|
||||||
|
"__data__": {
|
||||||
|
"id": "chatcmpl-831",
|
||||||
|
"choices": [
|
||||||
|
{
|
||||||
|
"finish_reason": "stop",
|
||||||
|
"index": 0,
|
||||||
|
"logprobs": null,
|
||||||
|
"message": {
|
||||||
|
"content": "I'm ready to help. What would you like to do? Would you like to:\n\n1. Ask a question\n2. Engage in a conversation\n3. Test my language understanding\n4. Something else (please specify)",
|
||||||
|
"refusal": null,
|
||||||
|
"role": "assistant",
|
||||||
|
"annotations": null,
|
||||||
|
"audio": null,
|
||||||
|
"function_call": null,
|
||||||
|
"tool_calls": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"created": 1754927774,
|
||||||
|
"model": "llama3.2:3b",
|
||||||
|
"object": "chat.completion",
|
||||||
|
"service_tier": null,
|
||||||
|
"system_fingerprint": "fp_ollama",
|
||||||
|
"usage": {
|
||||||
|
"completion_tokens": 48,
|
||||||
|
"prompt_tokens": 30,
|
||||||
|
"total_tokens": 78,
|
||||||
|
"completion_tokens_details": null,
|
||||||
|
"prompt_tokens_details": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"is_streaming": false
|
||||||
|
}
|
||||||
|
}
|
39
tests/integration/recordings/responses/7bcb0f86c91b.json
Normal file
39
tests/integration/recordings/responses/7bcb0f86c91b.json
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
{
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"url": "http://localhost:11434/api/generate",
|
||||||
|
"headers": {},
|
||||||
|
"body": {
|
||||||
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
|
"raw": true,
|
||||||
|
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 0<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
|
||||||
|
"options": {
|
||||||
|
"temperature": 0.0
|
||||||
|
},
|
||||||
|
"stream": false
|
||||||
|
},
|
||||||
|
"endpoint": "/api/generate",
|
||||||
|
"model": "llama3.2:3b-instruct-fp16"
|
||||||
|
},
|
||||||
|
"response": {
|
||||||
|
"body": {
|
||||||
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
|
"__data__": {
|
||||||
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
|
"created_at": "2025-08-11T15:51:12.918723Z",
|
||||||
|
"done": true,
|
||||||
|
"done_reason": "stop",
|
||||||
|
"total_duration": 8868987792,
|
||||||
|
"load_duration": 2793275292,
|
||||||
|
"prompt_eval_count": 21,
|
||||||
|
"prompt_eval_duration": 250000000,
|
||||||
|
"eval_count": 344,
|
||||||
|
"eval_duration": 5823000000,
|
||||||
|
"response": "Here are some common test metrics used to evaluate the performance of a system:\n\n1. **Accuracy**: The proportion of correct predictions or classifications out of total predictions made.\n2. **Precision**: The ratio of true positives (correctly predicted instances) to the sum of true positives and false positives (incorrectly predicted instances).\n3. **Recall**: The ratio of true positives to the sum of true positives and false negatives (missed instances).\n4. **F1-score**: The harmonic mean of precision and recall, providing a balanced measure of both.\n5. **Mean Squared Error (MSE)**: The average squared difference between predicted and actual values.\n6. **Mean Absolute Error (MAE)**: The average absolute difference between predicted and actual values.\n7. **Root Mean Squared Percentage Error (RMSPE)**: The square root of the mean of the squared percentage differences between predicted and actual values.\n8. **Coefficient of Determination (R-squared, R2)**: Measures how well a model fits the data, with higher values indicating better fit.\n9. **Mean Absolute Percentage Error (MAPE)**: The average absolute percentage difference between predicted and actual values.\n10. **Normalized Mean Squared Error (NMSE)**: Similar to MSE, but normalized by the mean of the actual values.\n\nThese metrics can be used for various types of data, including:\n\n* Regression problems (e.g., predicting continuous values)\n* Classification problems (e.g., predicting categorical labels)\n* Time series forecasting\n* Clustering and dimensionality reduction\n\nWhen choosing a metric, consider the specific problem you're trying to solve, the type of data, and the desired level of precision.",
|
||||||
|
"thinking": null,
|
||||||
|
"context": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"is_streaming": false
|
||||||
|
}
|
||||||
|
}
|
39
tests/integration/recordings/responses/c31a86ea6c58.json
Normal file
39
tests/integration/recordings/responses/c31a86ea6c58.json
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
{
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"url": "http://localhost:11434/api/generate",
|
||||||
|
"headers": {},
|
||||||
|
"body": {
|
||||||
|
"model": "llama3.2:3b",
|
||||||
|
"raw": true,
|
||||||
|
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 0<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
|
||||||
|
"options": {
|
||||||
|
"temperature": 0.0
|
||||||
|
},
|
||||||
|
"stream": false
|
||||||
|
},
|
||||||
|
"endpoint": "/api/generate",
|
||||||
|
"model": "llama3.2:3b"
|
||||||
|
},
|
||||||
|
"response": {
|
||||||
|
"body": {
|
||||||
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
|
"__data__": {
|
||||||
|
"model": "llama3.2:3b",
|
||||||
|
"created_at": "2025-08-11T15:56:06.703788Z",
|
||||||
|
"done": true,
|
||||||
|
"done_reason": "stop",
|
||||||
|
"total_duration": 2722294000,
|
||||||
|
"load_duration": 9736083,
|
||||||
|
"prompt_eval_count": 21,
|
||||||
|
"prompt_eval_duration": 113000000,
|
||||||
|
"eval_count": 324,
|
||||||
|
"eval_duration": 2598000000,
|
||||||
|
"response": "Here are some test metrics that can be used to evaluate the performance of a system:\n\n1. **Accuracy**: The proportion of correct predictions made by the model.\n2. **Precision**: The ratio of true positives (correctly predicted instances) to total positive predictions.\n3. **Recall**: The ratio of true positives to the sum of true positives and false negatives (missed instances).\n4. **F1-score**: The harmonic mean of precision and recall, providing a balanced measure of both.\n5. **Mean Squared Error (MSE)**: The average squared difference between predicted and actual values.\n6. **Mean Absolute Error (MAE)**: The average absolute difference between predicted and actual values.\n7. **Root Mean Squared Percentage Error (RMSPE)**: A variation of MSE that expresses the error as a percentage.\n8. **Coefficient of Determination (R-squared, R2)**: Measures how well the model explains the variance in the data.\n9. **Mean Absolute Percentage Error (MAPE)**: The average absolute percentage difference between predicted and actual values.\n10. **Mean Squared Logarithmic Error (MSLE)**: A variation of MSE that is more suitable for skewed distributions.\n\nThese metrics can be used to evaluate different aspects of a system's performance, such as:\n\n* Classification models: accuracy, precision, recall, F1-score\n* Regression models: MSE, MAE, RMSPE, R2, MSLE\n* Time series forecasting: MAPE, RMSPE\n\nNote that the choice of metric depends on the specific problem and data.",
|
||||||
|
"thinking": null,
|
||||||
|
"context": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"is_streaming": false
|
||||||
|
}
|
||||||
|
}
|
|
@ -16,7 +16,7 @@
|
||||||
"model": "llama3.2:3b",
|
"model": "llama3.2:3b",
|
||||||
"name": "llama3.2:3b",
|
"name": "llama3.2:3b",
|
||||||
"digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
|
"digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
|
||||||
"expires_at": "2025-08-06T15:57:21.573326-04:00",
|
"expires_at": "2025-08-11T11:58:17.058489-04:00",
|
||||||
"size": 4030033920,
|
"size": 4030033920,
|
||||||
"size_vram": 4030033920,
|
"size_vram": 4030033920,
|
||||||
"details": {
|
"details": {
|
||||||
|
@ -28,7 +28,27 @@
|
||||||
],
|
],
|
||||||
"parameter_size": "3.2B",
|
"parameter_size": "3.2B",
|
||||||
"quantization_level": "Q4_K_M"
|
"quantization_level": "Q4_K_M"
|
||||||
}
|
},
|
||||||
|
"context_length": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
|
"name": "llama3.2:3b-instruct-fp16",
|
||||||
|
"digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
|
||||||
|
"expires_at": "2025-08-11T11:56:22.727361-04:00",
|
||||||
|
"size": 8581748736,
|
||||||
|
"size_vram": 8581748736,
|
||||||
|
"details": {
|
||||||
|
"parent_model": "",
|
||||||
|
"format": "gguf",
|
||||||
|
"family": "llama",
|
||||||
|
"families": [
|
||||||
|
"llama"
|
||||||
|
],
|
||||||
|
"parameter_size": "3.2B",
|
||||||
|
"quantization_level": "F16"
|
||||||
|
},
|
||||||
|
"context_length": null
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
39
tests/integration/recordings/responses/f6857bcea729.json
Normal file
39
tests/integration/recordings/responses/f6857bcea729.json
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
{
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"url": "http://localhost:11434/api/generate",
|
||||||
|
"headers": {},
|
||||||
|
"body": {
|
||||||
|
"model": "llama3.2:3b",
|
||||||
|
"raw": true,
|
||||||
|
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 2<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
|
||||||
|
"options": {
|
||||||
|
"temperature": 0.0
|
||||||
|
},
|
||||||
|
"stream": false
|
||||||
|
},
|
||||||
|
"endpoint": "/api/generate",
|
||||||
|
"model": "llama3.2:3b"
|
||||||
|
},
|
||||||
|
"response": {
|
||||||
|
"body": {
|
||||||
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
|
"__data__": {
|
||||||
|
"model": "llama3.2:3b",
|
||||||
|
"created_at": "2025-08-11T15:56:13.082679Z",
|
||||||
|
"done": true,
|
||||||
|
"done_reason": "stop",
|
||||||
|
"total_duration": 2606245291,
|
||||||
|
"load_duration": 9979708,
|
||||||
|
"prompt_eval_count": 21,
|
||||||
|
"prompt_eval_duration": 23000000,
|
||||||
|
"eval_count": 321,
|
||||||
|
"eval_duration": 2572000000,
|
||||||
|
"response": "Here are some test metrics that can be used to evaluate the performance of a system:\n\n1. **Accuracy**: Measures how close the predicted values are to the actual values.\n2. **Precision**: Measures the proportion of true positives among all positive predictions made by the model.\n3. **Recall**: Measures the proportion of true positives among all actual positive instances.\n4. **F1-score**: The harmonic mean of precision and recall, providing a balanced measure of both.\n5. **Mean Squared Error (MSE)**: Measures the average squared difference between predicted and actual values.\n6. **Mean Absolute Error (MAE)**: Measures the average absolute difference between predicted and actual values.\n7. **Root Mean Squared Percentage Error (RMSPE)**: A variation of MSE that expresses errors as a percentage of the actual value.\n8. **Coefficient of Determination (R-squared, R2)**: Measures how well the model explains the variance in the data.\n9. **Mean Absolute Percentage Error (MAPE)**: Measures the average absolute percentage difference between predicted and actual values.\n10. **Mean Squared Logarithmic Error (MSLE)**: A variation of MSE that is more suitable for skewed distributions.\n\nThese metrics can be used to evaluate different aspects of a system's performance, such as:\n\n* Classification models: accuracy, precision, recall, F1-score\n* Regression models: MSE, MAE, RMSPE, R2\n* Time series forecasting: MAPE, MSLE\n\nNote that the choice of metric depends on the specific problem and data.",
|
||||||
|
"thinking": null,
|
||||||
|
"context": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"is_streaming": false
|
||||||
|
}
|
||||||
|
}
|
39
tests/integration/recordings/responses/f80b99430f7e.json
Normal file
39
tests/integration/recordings/responses/f80b99430f7e.json
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
{
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"url": "http://localhost:11434/api/generate",
|
||||||
|
"headers": {},
|
||||||
|
"body": {
|
||||||
|
"model": "llama3.2:3b",
|
||||||
|
"raw": true,
|
||||||
|
"prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 1<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
|
||||||
|
"options": {
|
||||||
|
"temperature": 0.0
|
||||||
|
},
|
||||||
|
"stream": false
|
||||||
|
},
|
||||||
|
"endpoint": "/api/generate",
|
||||||
|
"model": "llama3.2:3b"
|
||||||
|
},
|
||||||
|
"response": {
|
||||||
|
"body": {
|
||||||
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
|
"__data__": {
|
||||||
|
"model": "llama3.2:3b",
|
||||||
|
"created_at": "2025-08-11T15:56:10.465932Z",
|
||||||
|
"done": true,
|
||||||
|
"done_reason": "stop",
|
||||||
|
"total_duration": 3745686709,
|
||||||
|
"load_duration": 9734584,
|
||||||
|
"prompt_eval_count": 21,
|
||||||
|
"prompt_eval_duration": 23000000,
|
||||||
|
"eval_count": 457,
|
||||||
|
"eval_duration": 3712000000,
|
||||||
|
"response": "Here are some test metrics that can be used to evaluate the performance of a system:\n\n**Primary Metrics**\n\n1. **Response Time**: The time it takes for the system to respond to a request.\n2. **Throughput**: The number of requests processed by the system per unit time (e.g., requests per second).\n3. **Error Rate**: The percentage of requests that result in an error.\n\n**Secondary Metrics**\n\n1. **Average Response Time**: The average response time for all requests.\n2. **Median Response Time**: The middle value of the response times, used to detect outliers.\n3. **99th Percentile Response Time**: The response time at which 99% of requests are completed within this time.\n4. **Request Latency**: The difference between the request arrival time and the response time.\n\n**User Experience Metrics**\n\n1. **User Satisfaction (USAT)**: Measured through surveys or feedback forms to gauge user satisfaction with the system's performance.\n2. **First Response Time**: The time it takes for a user to receive their first response from the system.\n3. **Time Spent in System**: The total amount of time a user spends interacting with the system.\n\n**System Resource Metrics**\n\n1. **CPU Utilization**: The percentage of CPU resources being used by the system.\n2. **Memory Usage**: The amount of memory being used by the system.\n3. **Disk I/O Wait Time**: The average time spent waiting for disk I/O operations to complete.\n\n**Security Metrics**\n\n1. **Authentication Success Rate**: The percentage of successful authentication attempts.\n2. **Authorization Success Rate**: The percentage of successful authorization attempts.\n3. **Error Rate (Security)**: The percentage of security-related errors.\n\n**Other Metrics**\n\n1. **Page Load Time**: The time it takes for a page to load.\n2. **Click-Through Rate (CTR)**: The percentage of users who click on a link or button after seeing an ad or notification.\n3. 
**Conversion Rate**: The percentage of users who complete a desired action (e.g., fill out a form, make a purchase).\n\nThese metrics can be used to evaluate the performance and effectiveness of various aspects of your system, from user experience to security and resource utilization.",
|
||||||
|
"thinking": null,
|
||||||
|
"context": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"is_streaming": false
|
||||||
|
}
|
||||||
|
}
|
208
tests/integration/telemetry/test_telemetry_metrics.py
Normal file
208
tests/integration/telemetry/test_telemetry_metrics.py
Normal file
|
@ -0,0 +1,208 @@
|
||||||
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
|
# the root directory of this source tree.
|
||||||
|
|
||||||
|
import time
|
||||||
|
from datetime import UTC, datetime, timedelta
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module", autouse=True)
def setup_telemetry_metrics_data(openai_client, client_with_models, text_model_id):
    """Setup fixture that creates telemetry metrics data before tests run.

    Issues three chat completions through ``client_with_models`` and two through
    ``openai_client``, then polls ``telemetry.query_metrics`` for up to 30 seconds
    until ``completion_tokens`` data shows up, and finally waits 5 more seconds.

    The fixture skips when ``client_with_models`` has no ``base_url`` attribute.

    Returns:
        A dict with the keys ``prompt_tokens``, ``completion_tokens`` and
        ``total_tokens``. Each maps to a list of the usage numbers reported by the
        two ``openai_client`` completions; the first three completions are not
        recorded in these lists.
    """
    # Skip OpenAI tests if running in library mode
    if not hasattr(client_with_models, "base_url"):
        pytest.skip("OpenAI client tests not supported with library client")

    # These calls exist only to generate metrics; their responses are not needed.
    for i in range(3):
        client_with_models.inference.chat_completion(
            model_id=text_model_id,
            messages=[{"role": "user", "content": f"Test metrics generation {i}"}],
            stream=False,  # Ensure metrics are captured
        )

    prompt_tokens = []
    completion_tokens = []
    total_tokens = []

    # Create OpenAI completions to generate metrics using the proper OpenAI client
    for i in range(2):
        response = openai_client.chat.completions.create(
            model=text_model_id,
            messages=[{"role": "user", "content": f"OpenAI test {i}"}],
            stream=False,
        )
        usage = response.usage
        prompt_tokens.append(usage.prompt_tokens)
        completion_tokens.append(usage.completion_tokens)
        total_tokens.append(usage.total_tokens)

    # Wait for metrics to be logged. A monotonic clock keeps the 30 second budget
    # correct even if the wall clock is adjusted while we poll.
    deadline = time.monotonic() + 30
    while time.monotonic() < deadline:
        try:
            # Try to query metrics to see if they're available
            metrics_response = client_with_models.telemetry.query_metrics(
                metric_name="completion_tokens",
                start_time=int((datetime.now(UTC) - timedelta(minutes=5)).timestamp()),
            )
            if metrics_response and metrics_response[0].values:
                break
        except Exception:
            # Best-effort probe: a failing query is treated as "not ready yet",
            # so keep polling until the deadline instead of aborting setup.
            pass
        time.sleep(1)

    # Wait additional time to ensure all metrics are processed
    time.sleep(5)

    # Return the token lists for use in tests
    return {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": total_tokens,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_metrics_prompt_tokens(client_with_models, text_model_id, setup_telemetry_metrics_data):
    """Test that prompt_tokens metrics are queryable.

    Queries ``prompt_tokens`` for the last ten minutes and checks that the most
    recent value of the first returned series is one of the values captured by
    ``setup_telemetry_metrics_data``.
    """
    start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())

    response = client_with_models.telemetry.query_metrics(
        metric_name="prompt_tokens",
        start_time=start_time,
    )

    assert isinstance(response, list)
    # Fail with a readable message instead of an IndexError when nothing came back.
    assert response, "Expected at least one metric series for prompt_tokens"

    series = response[0]
    assert isinstance(series.values, list), "Series values should be a list"
    assert series.metric == "prompt_tokens"
    assert series.values, "Expected at least one value for prompt_tokens"

    # Use the actual values from setup instead of hardcoded values
    expected_values = setup_telemetry_metrics_data["prompt_tokens"]
    latest_value = series.values[-1].value
    assert latest_value in expected_values, f"Expected one of {expected_values}, got {latest_value}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_metrics_completion_tokens(client_with_models, text_model_id, setup_telemetry_metrics_data):
    """Test that completion_tokens metrics are queryable.

    Queries ``completion_tokens`` for the last ten minutes and checks that the most
    recent value of the first returned series is one of the values captured by
    ``setup_telemetry_metrics_data``.
    """
    start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())

    response = client_with_models.telemetry.query_metrics(
        metric_name="completion_tokens",
        start_time=start_time,
    )

    assert isinstance(response, list)
    # Fail with a readable message instead of an IndexError when nothing came back.
    assert response, "Expected at least one metric series for completion_tokens"

    series = response[0]
    assert isinstance(series.values, list), "Series values should be a list"
    assert series.metric == "completion_tokens"
    assert series.values, "Expected at least one value for completion_tokens"

    # Use the actual values from setup instead of hardcoded values
    expected_values = setup_telemetry_metrics_data["completion_tokens"]
    latest_value = series.values[-1].value
    assert latest_value in expected_values, f"Expected one of {expected_values}, got {latest_value}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_metrics_total_tokens(client_with_models, text_model_id, setup_telemetry_metrics_data):
    """Test that total_tokens metrics are queryable.

    Queries ``total_tokens`` for the last ten minutes and checks that the most
    recent value of the first returned series is one of the values captured by
    ``setup_telemetry_metrics_data``.
    """
    start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())

    response = client_with_models.telemetry.query_metrics(
        metric_name="total_tokens",
        start_time=start_time,
    )

    assert isinstance(response, list)
    # Fail with a readable message instead of an IndexError when nothing came back.
    assert response, "Expected at least one metric series for total_tokens"

    series = response[0]
    assert isinstance(series.values, list), "Series values should be a list"
    assert series.metric == "total_tokens"
    assert series.values, "Expected at least one value for total_tokens"

    # Use the actual values from setup instead of hardcoded values
    expected_values = setup_telemetry_metrics_data["total_tokens"]
    latest_value = series.values[-1].value
    assert latest_value in expected_values, f"Expected one of {expected_values}, got {latest_value}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_metrics_with_time_range(llama_stack_client, text_model_id):
    """Check that prompt_tokens can be queried with explicit start and end bounds."""
    now_ts = int(datetime.now(UTC).timestamp())
    window_seconds = 600  # 10 minutes
    query_window = {"start_time": now_ts - window_seconds, "end_time": now_ts}

    result = llama_stack_client.telemetry.query_metrics(metric_name="prompt_tokens", **query_window)

    assert isinstance(result, list)

    first_series = result[0]
    assert isinstance(first_series.values, list), "Should return a list of metric series"
    assert first_series.metric == "prompt_tokens"
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_metrics_with_label_matchers(llama_stack_client, text_model_id):
    """Check that prompt_tokens can be queried with a ``model_id`` label matcher."""
    ten_minutes_ago = datetime.now(UTC) - timedelta(minutes=10)
    # Equality matcher on the model_id label, using the model under test.
    model_matcher = {"name": "model_id", "value": text_model_id, "operator": "="}

    result = llama_stack_client.telemetry.query_metrics(
        metric_name="prompt_tokens",
        start_time=int(ten_minutes_ago.timestamp()),
        label_matchers=[model_matcher],
    )

    series_values = result[0].values
    assert isinstance(series_values, list), "Should return a list of metric series"
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_metrics_nonexistent_metric(llama_stack_client):
    """Check that a query for an unknown metric name yields an empty list."""
    window_start = datetime.now(UTC) - timedelta(minutes=10)

    result = llama_stack_client.telemetry.query_metrics(
        metric_name="nonexistent_metric",
        start_time=int(window_start.timestamp()),
    )

    assert isinstance(result, list), "Should return an empty list for nonexistent metric"
    series_count = len(result)
    assert series_count == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_metrics_with_granularity(llama_stack_client, text_model_id):
    """Test that metrics are queryable with different granularity levels.

    Runs the same ``total_tokens`` query with hourly, daily and no granularity and
    checks that each one returns at least one series whose ``values`` is a list.
    """
    start_time = int((datetime.now(UTC) - timedelta(minutes=10)).timestamp())

    # Label used in failure messages -> granularity argument.
    # None means no granularity (raw data points).
    granularities = {"Hourly": "1h", "Daily": "1d", "No": None}

    # Issue every query before asserting anything.
    responses = {
        label: llama_stack_client.telemetry.query_metrics(
            metric_name="total_tokens",
            start_time=start_time,
            granularity=granularity,
        )
        for label, granularity in granularities.items()
    }

    # All should return valid data. The number of values per series is not
    # asserted: it depends on data distribution, and a ``len(...) >= 0`` check
    # can never fail.
    for label, response in responses.items():
        # Fail with a readable message instead of an IndexError when nothing came back.
        assert response, f"{label} granularity should return at least one metric series"
        assert isinstance(response[0].values, list), f"{label} granularity should return data"
|
Loading…
Add table
Add a link
Reference in a new issue