fix!: update eval-tasks -> benchmarks (#1032)

# What does this PR do?

- Update `/eval-tasks` to `/benchmarks`
- ⚠️ Remove the differentiation between `app` vs. `benchmark` eval task configs. There is now only `BenchmarkConfig`. The overloaded `benchmark` type is confusing and does not add any value. Backward compatibility is preserved, since the `type` field is not used anywhere.

## Test Plan

- This change is backward compatible.
- Run the notebook tests with:

```
pytest -v -s --nbval-lax ./docs/getting_started.ipynb
pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
```

<img width="846" alt="image" src="https://github.com/user-attachments/assets/d2fc06a7-593a-444f-bc1f-10ab9b0c843d" />

---

Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
Signed-off-by: Ben Browning <bbrownin@redhat.com>
Signed-off-by: Sébastien Han <seb@redhat.com>
Signed-off-by: reidliu <reid201711@gmail.com>
Co-authored-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
Co-authored-by: Ben Browning <ben324@gmail.com>
Co-authored-by: Sébastien Han <seb@redhat.com>
Co-authored-by: Reid <61492567+reidliu41@users.noreply.github.com>
Co-authored-by: reidliu <reid201711@gmail.com>
Co-authored-by: Yuan Tang <terrytangyuan@gmail.com>
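For reference, a minimal before/after sketch of the client-facing rename (not taken verbatim from this diff; the server URL, IDs, scoring functions, and agent config are illustrative):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")  # assumed local server
agent_config = {"model": "meta-llama/Llama-3.1-8B-Instruct", "instructions": "You are a helpful assistant."}  # illustrative

# Before this PR (old names; still routed for backward compatibility):
client.eval_tasks.register(
    eval_task_id="my_eval",
    dataset_id="my_dataset",
    scoring_functions=["accuracy", "relevance"],
)
job = client.eval.run_eval(
    task_id="my_eval",
    task_config={"type": "app", "eval_candidate": {"type": "agent", "config": agent_config}},
)
result = client.eval.job_result(task_id="my_eval", job_id=job.job_id)

# After this PR (benchmarks naming; the "type" field is kept but unused):
client.benchmarks.register(
    benchmark_id="my_eval",
    dataset_id="my_dataset",
    scoring_functions=["accuracy", "relevance"],
)
job = client.eval.run_eval(
    benchmark_id="my_eval",
    task_config={"type": "app", "eval_candidate": {"type": "agent", "config": agent_config}},
)
result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
```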
This commit is contained in:
parent 225dd38e5c, commit 8b655e3cd2

60 changed files with 2622 additions and 1910 deletions
docs/_static/llama-stack-spec.html (vendored): 2286 lines changed; file diff suppressed because it is too large
docs/_static/llama-stack-spec.yaml (vendored): 1465 lines changed; file diff suppressed because it is too large
|
@ -324,7 +324,7 @@
|
|||
"- vector_io\n",
|
||||
"container_image: null\n",
|
||||
"datasets: <span style=\"font-weight: bold\">[]</span>\n",
|
||||
"eval_tasks: <span style=\"font-weight: bold\">[]</span>\n",
|
||||
"benchmarks: <span style=\"font-weight: bold\">[]</span>\n",
|
||||
"image_name: together\n",
|
||||
"metadata_store:\n",
|
||||
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/ashwin/.llama/distributions/together/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">registry.db</span>\n",
|
||||
|
@ -508,7 +508,7 @@
|
|||
"- vector_io\n",
|
||||
"container_image: null\n",
|
||||
"datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
|
||||
"eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
|
||||
"benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
|
||||
"image_name: together\n",
|
||||
"metadata_store:\n",
|
||||
" db_path: \u001b[35m/Users/ashwin/.llama/distributions/together/\u001b[0m\u001b[95mregistry.db\u001b[0m\n",
|
||||
|
|
|
@ -370,7 +370,7 @@
|
|||
"- tool_runtime\n",
|
||||
"datasets: <span style=\"font-weight: bold\">[]</span>\n",
|
||||
"container_image: null\n",
|
||||
"eval_tasks: <span style=\"font-weight: bold\">[]</span>\n",
|
||||
"benchmarks: <span style=\"font-weight: bold\">[]</span>\n",
|
||||
"image_name: together\n",
|
||||
"memory_banks: <span style=\"font-weight: bold\">[]</span>\n",
|
||||
"metadata_store:\n",
|
||||
|
@ -551,7 +551,7 @@
|
|||
"- tool_runtime\n",
|
||||
"datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
|
||||
"container_image: null\n",
|
||||
"eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
|
||||
"benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
|
||||
"image_name: together\n",
|
||||
"memory_banks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
|
||||
"metadata_store:\n",
|
||||
|
|
|
@ -647,6 +647,7 @@ class Generator:
|
|||
description = "\n".join(
|
||||
filter(None, [doc_string.short_description, doc_string.long_description])
|
||||
)
|
||||
|
||||
return Operation(
|
||||
tags=[op.defining_class.__name__],
|
||||
summary=None,
|
||||
|
@ -656,6 +657,7 @@ class Generator:
|
|||
requestBody=requestBody,
|
||||
responses=responses,
|
||||
callbacks=callbacks,
|
||||
deprecated=True if "DEPRECATED" in op.func_name else None,
|
||||
security=[] if op.public else None,
|
||||
)
|
||||
|
||||
|
|
|
@ -117,6 +117,7 @@ class Operation:
|
|||
requestBody: Optional[RequestBody] = None
|
||||
callbacks: Optional[Dict[str, "Callback"]] = None
|
||||
security: Optional[List["SecurityRequirement"]] = None
|
||||
deprecated: Optional[bool] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
|
@ -41,14 +41,14 @@ system_message = {
|
|||
"content": SYSTEM_PROMPT_TEMPLATE,
|
||||
}
|
||||
|
||||
client.eval_tasks.register(
|
||||
eval_task_id="meta-reference::mmmu",
|
||||
client.benchmarks.register(
|
||||
benchmark_id="meta-reference::mmmu",
|
||||
dataset_id=f"mmmu-{subset}-{split}",
|
||||
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
|
||||
)
|
||||
|
||||
response = client.eval.evaluate_rows(
|
||||
task_id="meta-reference::mmmu",
|
||||
benchmark_id="meta-reference::mmmu",
|
||||
input_rows=eval_rows,
|
||||
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
|
||||
task_config={
|
||||
|
@ -99,14 +99,14 @@ eval_rows = client.datasetio.get_rows_paginated(
|
|||
```
|
||||
|
||||
```python
|
||||
client.eval_tasks.register(
|
||||
eval_task_id="meta-reference::simpleqa",
|
||||
client.benchmarks.register(
|
||||
benchmark_id="meta-reference::simpleqa",
|
||||
dataset_id=simpleqa_dataset_id,
|
||||
scoring_functions=["llm-as-judge::405b-simpleqa"],
|
||||
)
|
||||
|
||||
response = client.eval.evaluate_rows(
|
||||
task_id="meta-reference::simpleqa",
|
||||
benchmark_id="meta-reference::simpleqa",
|
||||
input_rows=eval_rows.rows,
|
||||
scoring_functions=["llm-as-judge::405b-simpleqa"],
|
||||
task_config={
|
||||
|
@ -156,7 +156,7 @@ agent_config = {
|
|||
}
|
||||
|
||||
response = client.eval.evaluate_rows(
|
||||
task_id="meta-reference::simpleqa",
|
||||
benchmark_id="meta-reference::simpleqa",
|
||||
input_rows=eval_rows.rows,
|
||||
scoring_functions=["llm-as-judge::405b-simpleqa"],
|
||||
task_config={
|
||||
|
|
|
@ -10,15 +10,15 @@ Here's how to set up basic evaluation:
|
|||
|
||||
```python
|
||||
# Create an evaluation task
|
||||
response = client.eval_tasks.register(
|
||||
eval_task_id="my_eval",
|
||||
response = client.benchmarks.register(
|
||||
benchmark_id="my_eval",
|
||||
dataset_id="my_dataset",
|
||||
scoring_functions=["accuracy", "relevance"],
|
||||
)
|
||||
|
||||
# Run evaluation
|
||||
job = client.eval.run_eval(
|
||||
task_id="my_eval",
|
||||
benchmark_id="my_eval",
|
||||
task_config={
|
||||
"type": "app",
|
||||
"eval_candidate": {"type": "agent", "config": agent_config},
|
||||
|
@ -26,5 +26,5 @@ job = client.eval.run_eval(
|
|||
)
|
||||
|
||||
# Get results
|
||||
result = client.eval.job_result(task_id="my_eval", job_id=job.job_id)
|
||||
result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
|
||||
```
|
||||
|
|
|
@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl
|
|||
We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications.
|
||||
- `/datasetio` + `/datasets` API
|
||||
- `/scoring` + `/scoring_functions` API
|
||||
- `/eval` + `/eval_tasks` API
|
||||
- `/eval` + `/benchmarks` API
|
||||
|
||||
This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
|
||||
|
||||
|
@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
|
|||
- **Scoring**: evaluate outputs of the system.
|
||||
- Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics.
|
||||
- **Eval**: generate outputs (via Inference or Agents) and perform scoring.
|
||||
- Associated with `EvalTask` resource.
|
||||
- Associated with `Benchmark` resource.
|
||||
|
||||
|
||||
Use the following decision tree to decide how to use LlamaStack Evaluation flow.
|
||||
|
|
|
@ -42,7 +42,7 @@ Some of these APIs are associated with a set of **Resources**. Here is the mappi
|
|||
- **Tool Runtime** is associated with `ToolGroup` resources.
|
||||
- **DatasetIO** is associated with `Dataset` resources.
|
||||
- **Scoring** is associated with `ScoringFunction` resources.
|
||||
- **Eval** is associated with `Model` and `EvalTask` resources.
|
||||
- **Eval** is associated with `Model` and `Benchmark` resources.
|
||||
|
||||
Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack.
|
||||
|
||||
|
|
|
@ -64,7 +64,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
|
|||
```
|
||||
|
||||
```bash
|
||||
$ llama-stack-client eval_tasks register \
|
||||
$ llama-stack-client benchmarks register \
|
||||
--eval-task-id meta-reference-mmlu \
|
||||
--provider-id meta-reference \
|
||||
--dataset-id mmlu \
|
||||
|
@ -86,7 +86,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
|
|||
- Under the hood, it uses Llama Stack's `/providers` API to get information about the providers.
|
||||
|
||||
- **API Resources**: Inspect Llama Stack API resources
|
||||
- This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `eval_tasks`, `shields`).
|
||||
- This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `benchmarks`, `shields`).
|
||||
- Under the hood, it uses Llama Stack's `/<resources>/list` API to get information about each resources.
|
||||
- Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources.
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl
|
|||
We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications.
|
||||
- `/datasetio` + `/datasets` API
|
||||
- `/scoring` + `/scoring_functions` API
|
||||
- `/eval` + `/eval_tasks` API
|
||||
- `/eval` + `/benchmarks` API
|
||||
|
||||
This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
|
||||
|
||||
|
@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
|
|||
- **Scoring**: evaluate outputs of the system.
|
||||
- Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics.
|
||||
- **Eval**: generate outputs (via Inference or Agents) and perform scoring.
|
||||
- Associated with `EvalTask` resource.
|
||||
- Associated with `Benchmark` resource.
|
||||
|
||||
|
||||
Use the following decision tree to decide how to use LlamaStack Evaluation flow.
|
||||
|
@ -77,14 +77,14 @@ system_message = {
|
|||
"content": SYSTEM_PROMPT_TEMPLATE,
|
||||
}
|
||||
|
||||
client.eval_tasks.register(
|
||||
eval_task_id="meta-reference::mmmu",
|
||||
client.benchmarks.register(
|
||||
benchmark_id="meta-reference::mmmu",
|
||||
dataset_id=f"mmmu-{subset}-{split}",
|
||||
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
|
||||
)
|
||||
|
||||
response = client.eval.evaluate_rows(
|
||||
task_id="meta-reference::mmmu",
|
||||
benchmark_id="meta-reference::mmmu",
|
||||
input_rows=eval_rows,
|
||||
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
|
||||
task_config={
|
||||
|
@ -135,14 +135,14 @@ eval_rows = client.datasetio.get_rows_paginated(
|
|||
```
|
||||
|
||||
```python
|
||||
client.eval_tasks.register(
|
||||
eval_task_id="meta-reference::simpleqa",
|
||||
client.benchmarks.register(
|
||||
benchmark_id="meta-reference::simpleqa",
|
||||
dataset_id=simpleqa_dataset_id,
|
||||
scoring_functions=["llm-as-judge::405b-simpleqa"],
|
||||
)
|
||||
|
||||
response = client.eval.evaluate_rows(
|
||||
task_id="meta-reference::simpleqa",
|
||||
benchmark_id="meta-reference::simpleqa",
|
||||
input_rows=eval_rows.rows,
|
||||
scoring_functions=["llm-as-judge::405b-simpleqa"],
|
||||
task_config={
|
||||
|
@ -192,7 +192,7 @@ agent_config = {
|
|||
}
|
||||
|
||||
response = client.eval.evaluate_rows(
|
||||
task_id="meta-reference::simpleqa",
|
||||
benchmark_id="meta-reference::simpleqa",
|
||||
input_rows=eval_rows.rows,
|
||||
scoring_functions=["llm-as-judge::405b-simpleqa"],
|
||||
task_config={
|
||||
|
@ -281,7 +281,7 @@ The following examples give the quick steps to start running evaluations using t
|
|||
|
||||
#### Benchmark Evaluation CLI
|
||||
Usage: There are 2 inputs necessary for running a benchmark eval
|
||||
- `eval-task-id`: the identifier associated with the eval task. Each `EvalTask` is parametrized by
|
||||
- `eval-task-id`: the identifier associated with the eval task. Each `Benchmark` is parametrized by
|
||||
- `dataset_id`: the identifier associated with the dataset.
|
||||
- `List[scoring_function_id]`: list of scoring function identifiers.
|
||||
- `eval-task-config`: specifies the configuration of the model / agent to evaluate on.
|
||||
|
@ -289,7 +289,7 @@ Usage: There are 2 inputs necessary for running a benchmark eval
|
|||
|
||||
```
|
||||
llama-stack-client eval run_benchmark <eval-task-id> \
|
||||
--eval-task-config ~/eval_task_config.json \
|
||||
--eval-task-config ~/benchmark_config.json \
|
||||
--visualize
|
||||
```
|
||||
|
||||
|
@ -309,15 +309,15 @@ llama-stack-client eval run_scoring <scoring_fn_id_1> <scoring_fn_id_2> ... <sco
|
|||
--output-dir ./
|
||||
```
|
||||
|
||||
#### Defining EvalTaskConfig
|
||||
The `EvalTaskConfig` are user specified config to define:
|
||||
#### Defining BenchmarkConfig
|
||||
The `BenchmarkConfig` are user specified config to define:
|
||||
1. `EvalCandidate` to run generation on:
|
||||
- `ModelCandidate`: The model will be used for generation through LlamaStack /inference API.
|
||||
- `AgentCandidate`: The agentic system specified by AgentConfig will be used for generation through LlamaStack /agents API.
|
||||
2. Optionally scoring function params to allow customization of scoring function behaviour. This is useful to parameterize generic scoring functions such as LLMAsJudge with custom `judge_model` / `judge_prompt`.
|
||||
|
||||
|
||||
**Example Benchmark EvalTaskConfig**
|
||||
**Example Benchmark BenchmarkConfig**
|
||||
```json
|
||||
{
|
||||
"type": "benchmark",
|
||||
|
@ -335,7 +335,7 @@ The `EvalTaskConfig` are user specified config to define:
|
|||
}
|
||||
```
|
||||
|
||||
**Example Application EvalTaskConfig**
|
||||
**Example Application BenchmarkConfig**
|
||||
```json
|
||||
{
|
||||
"type": "app",
|
||||
|
|
|
@ -161,14 +161,14 @@ Options:
|
|||
|
||||
## Eval Task Management
|
||||
|
||||
### `llama-stack-client eval_tasks list`
|
||||
### `llama-stack-client benchmarks list`
|
||||
```bash
|
||||
$ llama-stack-client eval_tasks list
|
||||
$ llama-stack-client benchmarks list
|
||||
```
|
||||
|
||||
### `llama-stack-client eval_tasks register`
|
||||
### `llama-stack-client benchmarks register`
|
||||
```bash
|
||||
$ llama-stack-client eval_tasks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
|
||||
$ llama-stack-client benchmarks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
|
||||
```
|
||||
|
||||
Options:
|
||||
|
@ -191,7 +191,7 @@ Options:
|
|||
- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
|
||||
- `--visualize`: Optional flag. If set, visualizes evaluation results after completion
|
||||
|
||||
Example eval_task_config.json:
|
||||
Example benchmark_config.json:
|
||||
```json
|
||||
{
|
||||
"type": "benchmark",
|
||||
|
|
|
@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job
|
|||
|
||||
Methods:
|
||||
|
||||
- <code title="post /v1/eval/tasks/{task_id}/evaluations">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">evaluate_rows</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_evaluate_rows_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
|
||||
- <code title="post /v1/eval/tasks/{task_id}/jobs">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">run_eval</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_run_eval_params.py">params</a>) -> <a href="./src/llama_stack_client/types/job.py">Job</a></code>
|
||||
- <code title="post /v1/eval/tasks/{benchmark_id}/evaluations">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">evaluate_rows</a>(benchmark_id, \*\*<a href="src/llama_stack_client/types/eval_evaluate_rows_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
|
||||
- <code title="post /v1/eval/tasks/{benchmark_id}/jobs">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">run_eval</a>(benchmark_id, \*\*<a href="src/llama_stack_client/types/eval_run_eval_params.py">params</a>) -> <a href="./src/llama_stack_client/types/job.py">Job</a></code>
|
||||
|
||||
### Jobs
|
||||
|
||||
|
@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse
|
|||
|
||||
Methods:
|
||||
|
||||
- <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}/result">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">retrieve</a>(job_id, \*, task_id) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
|
||||
- <code title="delete /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">cancel</a>(job_id, \*, task_id) -> None</code>
|
||||
- <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">status</a>(job_id, \*, task_id) -> Optional[JobStatusResponse]</code>
|
||||
- <code title="get /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">retrieve</a>(job_id, \*, benchmark_id) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
|
||||
- <code title="delete /v1/eval/tasks/{benchmark_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">cancel</a>(job_id, \*, benchmark_id) -> None</code>
|
||||
- <code title="get /v1/eval/tasks/{benchmark_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">status</a>(job_id, \*, benchmark_id) -> Optional[JobStatusResponse]</code>
|
||||
|
||||
## Inspect
|
||||
|
||||
|
@ -443,20 +443,20 @@ Methods:
|
|||
- <code title="get /v1/scoring-functions">client.scoring_functions.<a href="./src/llama_stack_client/resources/scoring_functions.py">list</a>() -> <a href="./src/llama_stack_client/types/scoring_function_list_response.py">ScoringFunctionListResponse</a></code>
|
||||
- <code title="post /v1/scoring-functions">client.scoring_functions.<a href="./src/llama_stack_client/resources/scoring_functions.py">register</a>(\*\*<a href="src/llama_stack_client/types/scoring_function_register_params.py">params</a>) -> None</code>
|
||||
|
||||
## EvalTasks
|
||||
## Benchmarks
|
||||
|
||||
Types:
|
||||
|
||||
```python
|
||||
from llama_stack_client.types import (
|
||||
EvalTask,
|
||||
ListEvalTasksResponse,
|
||||
EvalTaskListResponse,
|
||||
Benchmark,
|
||||
ListBenchmarksResponse,
|
||||
BenchmarkListResponse,
|
||||
)
|
||||
```
|
||||
|
||||
Methods:
|
||||
|
||||
- <code title="get /v1/eval-tasks/{eval_task_id}">client.eval_tasks.<a href="./src/llama_stack_client/resources/eval_tasks.py">retrieve</a>(eval_task_id) -> <a href="./src/llama_stack_client/types/eval_task.py">Optional[EvalTask]</a></code>
|
||||
- <code title="get /v1/eval-tasks">client.eval_tasks.<a href="./src/llama_stack_client/resources/eval_tasks.py">list</a>() -> <a href="./src/llama_stack_client/types/eval_task_list_response.py">EvalTaskListResponse</a></code>
|
||||
- <code title="post /v1/eval-tasks">client.eval_tasks.<a href="./src/llama_stack_client/resources/eval_tasks.py">register</a>(\*\*<a href="src/llama_stack_client/types/eval_task_register_params.py">params</a>) -> None</code>
|
||||
- <code title="get /v1/eval-tasks/{benchmark_id}">client.benchmarks.<a href="./src/llama_stack_client/resources/benchmarks.py">retrieve</a>(benchmark_id) -> <a href="./src/llama_stack_client/types/benchmark.py">Optional[Benchmark]</a></code>
|
||||
- <code title="get /v1/eval-tasks">client.benchmarks.<a href="./src/llama_stack_client/resources/benchmarks.py">list</a>() -> <a href="./src/llama_stack_client/types/benchmark_list_response.py">BenchmarkListResponse</a></code>
|
||||
- <code title="post /v1/eval-tasks">client.benchmarks.<a href="./src/llama_stack_client/resources/benchmarks.py">register</a>(\*\*<a href="src/llama_stack_client/types/benchmark_register_params.py">params</a>) -> None</code>
|
||||
|
|
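A short sketch exercising the renamed SDK surface listed above; the server URL, benchmark id, and job id are illustrative and assume a running Llama Stack server:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")  # assumed local server

# Benchmarks resource (formerly eval_tasks)
all_benchmarks = client.benchmarks.list()
benchmark = client.benchmarks.retrieve(benchmark_id="meta-reference::mmmu")

# Eval jobs are now addressed by benchmark_id instead of task_id
status = client.eval.jobs.status(job_id="job-1234", benchmark_id="meta-reference::mmmu")
print(status)

# Once the job completes, fetch its results (or cancel it with client.eval.jobs.cancel)
result = client.eval.jobs.retrieve(job_id="job-1234", benchmark_id="meta-reference::mmmu")
```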
|
@ -4,4 +4,4 @@
|
|||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .eval_tasks import * # noqa: F401 F403
|
||||
from .benchmarks import * # noqa: F401 F403
|
llama_stack/apis/benchmarks/benchmarks.py (new file, 86 lines)
@@ -0,0 +1,86 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable

from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field

from llama_stack.apis.resource import Resource, ResourceType


class CommonBenchmarkFields(BaseModel):
    dataset_id: str
    scoring_functions: List[str]
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Metadata for this evaluation task",
    )


@json_schema_type
class Benchmark(CommonBenchmarkFields, Resource):
    type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value

    @property
    def benchmark_id(self) -> str:
        return self.identifier

    @property
    def provider_benchmark_id(self) -> str:
        return self.provider_resource_id


class BenchmarkInput(CommonBenchmarkFields, BaseModel):
    benchmark_id: str
    provider_id: Optional[str] = None
    provider_benchmark_id: Optional[str] = None


class ListBenchmarksResponse(BaseModel):
    data: List[Benchmark]


@runtime_checkable
class Benchmarks(Protocol):
    @webmethod(route="/eval/benchmarks", method="GET")
    async def list_benchmarks(self) -> ListBenchmarksResponse: ...

    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET")
    async def get_benchmark(
        self,
        benchmark_id: str,
    ) -> Optional[Benchmark]: ...

    @webmethod(route="/eval/benchmarks", method="POST")
    async def register_benchmark(
        self,
        benchmark_id: str,
        dataset_id: str,
        scoring_functions: List[str],
        provider_benchmark_id: Optional[str] = None,
        provider_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None: ...

    @webmethod(route="/eval-tasks", method="GET")
    async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ...

    @webmethod(route="/eval-tasks/{task_id}", method="GET")
    async def DEPRECATED_get_eval_task(
        self,
        eval_task_id: str,
    ) -> Optional[Benchmark]: ...

    @webmethod(route="/eval-tasks", method="POST")
    async def DEPRECATED_register_eval_task(
        self,
        eval_task_id: str,
        dataset_id: str,
        scoring_functions: List[str],
        provider_benchmark_id: Optional[str] = None,
        provider_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None: ...
|
@ -28,7 +28,7 @@ class Api(Enum):
|
|||
vector_dbs = "vector_dbs"
|
||||
datasets = "datasets"
|
||||
scoring_functions = "scoring_functions"
|
||||
eval_tasks = "eval_tasks"
|
||||
benchmarks = "benchmarks"
|
||||
tool_groups = "tool_groups"
|
||||
|
||||
# built-in API
|
||||
|
|
|
@ -38,19 +38,9 @@ EvalCandidate = register_schema(
|
|||
|
||||
|
||||
@json_schema_type
|
||||
class BenchmarkEvalTaskConfig(BaseModel):
|
||||
class BenchmarkConfig(BaseModel):
|
||||
type: Literal["benchmark"] = "benchmark"
|
||||
eval_candidate: EvalCandidate
|
||||
num_examples: Optional[int] = Field(
|
||||
description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
|
||||
default=None,
|
||||
)
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class AppEvalTaskConfig(BaseModel):
|
||||
type: Literal["app"] = "app"
|
||||
eval_candidate: EvalCandidate
|
||||
scoring_params: Dict[str, ScoringFnParams] = Field(
|
||||
description="Map between scoring function id and parameters for each scoring function you want to run",
|
||||
default_factory=dict,
|
||||
|
@ -62,12 +52,6 @@ class AppEvalTaskConfig(BaseModel):
|
|||
# we could optinally add any specific dataset config here
|
||||
|
||||
|
||||
EvalTaskConfig = register_schema(
|
||||
Annotated[Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")],
|
||||
name="EvalTaskConfig",
|
||||
)
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class EvaluateResponse(BaseModel):
|
||||
generations: List[Dict[str, Any]]
|
||||
|
@ -76,27 +60,52 @@ class EvaluateResponse(BaseModel):
|
|||
|
||||
|
||||
class Eval(Protocol):
|
||||
@webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
|
||||
async def run_eval(
|
||||
self,
|
||||
benchmark_id: str,
|
||||
task_config: BenchmarkConfig,
|
||||
) -> Job: ...
|
||||
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
|
||||
async def evaluate_rows(
|
||||
self,
|
||||
benchmark_id: str,
|
||||
input_rows: List[Dict[str, Any]],
|
||||
scoring_functions: List[str],
|
||||
task_config: BenchmarkConfig,
|
||||
) -> EvaluateResponse: ...
|
||||
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
|
||||
async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ...
|
||||
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
|
||||
async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ...
|
||||
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
|
||||
async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ...
|
||||
|
||||
@webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
|
||||
async def DEPRECATED_run_eval(
|
||||
self,
|
||||
task_id: str,
|
||||
task_config: EvalTaskConfig,
|
||||
task_config: BenchmarkConfig,
|
||||
) -> Job: ...
|
||||
|
||||
@webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
|
||||
async def evaluate_rows(
|
||||
async def DEPRECATED_evaluate_rows(
|
||||
self,
|
||||
task_id: str,
|
||||
input_rows: List[Dict[str, Any]],
|
||||
scoring_functions: List[str],
|
||||
task_config: EvalTaskConfig,
|
||||
task_config: BenchmarkConfig,
|
||||
) -> EvaluateResponse: ...
|
||||
|
||||
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
|
||||
async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
|
||||
async def DEPRECATED_job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
|
||||
|
||||
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
|
||||
async def job_cancel(self, task_id: str, job_id: str) -> None: ...
|
||||
async def DEPRECATED_job_cancel(self, task_id: str, job_id: str) -> None: ...
|
||||
|
||||
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
|
||||
async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ...
|
||||
async def DEPRECATED_job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ...
|
||||
|
|
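With the `app` vs. `benchmark` split gone, a run is configured with the single `BenchmarkConfig` above. A minimal sketch, assuming the import paths of this commit, an existing `Eval` implementation handle, and an illustrative model id:

```python
from llama_stack.apis.eval import BenchmarkConfig, ModelCandidate  # assumed import paths
from llama_stack.apis.inference import SamplingParams


async def run_mmmu_eval(eval_impl) -> None:
    # eval_impl: any object implementing the Eval protocol above (assumed to exist)
    config = BenchmarkConfig(
        eval_candidate=ModelCandidate(
            type="model",
            model="meta-llama/Llama-3.1-8B-Instruct",  # illustrative model id
            sampling_params=SamplingParams(max_tokens=512),
        ),
        num_examples=5,  # evaluate a small slice while iterating
    )
    job = await eval_impl.run_eval(
        benchmark_id="meta-reference::mmmu",
        task_config=config,
    )
    print(job.job_id)
```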
|
@ -1,66 +0,0 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
|
||||
|
||||
from llama_models.schema_utils import json_schema_type, webmethod
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from llama_stack.apis.resource import Resource, ResourceType
|
||||
|
||||
|
||||
class CommonEvalTaskFields(BaseModel):
|
||||
dataset_id: str
|
||||
scoring_functions: List[str]
|
||||
metadata: Dict[str, Any] = Field(
|
||||
default_factory=dict,
|
||||
description="Metadata for this evaluation task",
|
||||
)
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class EvalTask(CommonEvalTaskFields, Resource):
|
||||
type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value
|
||||
|
||||
@property
|
||||
def eval_task_id(self) -> str:
|
||||
return self.identifier
|
||||
|
||||
@property
|
||||
def provider_eval_task_id(self) -> str:
|
||||
return self.provider_resource_id
|
||||
|
||||
|
||||
class EvalTaskInput(CommonEvalTaskFields, BaseModel):
|
||||
eval_task_id: str
|
||||
provider_id: Optional[str] = None
|
||||
provider_eval_task_id: Optional[str] = None
|
||||
|
||||
|
||||
class ListEvalTasksResponse(BaseModel):
|
||||
data: List[EvalTask]
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class EvalTasks(Protocol):
|
||||
@webmethod(route="/eval-tasks", method="GET")
|
||||
async def list_eval_tasks(self) -> ListEvalTasksResponse: ...
|
||||
|
||||
@webmethod(route="/eval-tasks/{eval_task_id}", method="GET")
|
||||
async def get_eval_task(
|
||||
self,
|
||||
eval_task_id: str,
|
||||
) -> Optional[EvalTask]: ...
|
||||
|
||||
@webmethod(route="/eval-tasks", method="POST")
|
||||
async def register_eval_task(
|
||||
self,
|
||||
eval_task_id: str,
|
||||
dataset_id: str,
|
||||
scoring_functions: List[str],
|
||||
provider_eval_task_id: Optional[str] = None,
|
||||
provider_id: Optional[str] = None,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
) -> None: ...
|
|
@ -15,7 +15,7 @@ class ResourceType(Enum):
|
|||
vector_db = "vector_db"
|
||||
dataset = "dataset"
|
||||
scoring_function = "scoring_function"
|
||||
eval_task = "eval_task"
|
||||
benchmark = "benchmark"
|
||||
tool = "tool"
|
||||
tool_group = "tool_group"
|
||||
|
||||
|
|
|
@ -8,10 +8,10 @@ from typing import Annotated, Any, Dict, List, Optional, Union
|
|||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput
|
||||
from llama_stack.apis.datasetio import DatasetIO
|
||||
from llama_stack.apis.datasets import Dataset, DatasetInput
|
||||
from llama_stack.apis.eval import Eval
|
||||
from llama_stack.apis.eval_tasks import EvalTask, EvalTaskInput
|
||||
from llama_stack.apis.inference import Inference
|
||||
from llama_stack.apis.models import Model, ModelInput
|
||||
from llama_stack.apis.safety import Safety
|
||||
|
@ -37,7 +37,7 @@ RoutableObject = Union[
|
|||
VectorDB,
|
||||
Dataset,
|
||||
ScoringFn,
|
||||
EvalTask,
|
||||
Benchmark,
|
||||
Tool,
|
||||
ToolGroup,
|
||||
]
|
||||
|
@ -50,7 +50,7 @@ RoutableObjectWithProvider = Annotated[
|
|||
VectorDB,
|
||||
Dataset,
|
||||
ScoringFn,
|
||||
EvalTask,
|
||||
Benchmark,
|
||||
Tool,
|
||||
ToolGroup,
|
||||
],
|
||||
|
@ -173,7 +173,7 @@ a default SQLite store will be used.""",
|
|||
vector_dbs: List[VectorDBInput] = Field(default_factory=list)
|
||||
datasets: List[DatasetInput] = Field(default_factory=list)
|
||||
scoring_fns: List[ScoringFnInput] = Field(default_factory=list)
|
||||
eval_tasks: List[EvalTaskInput] = Field(default_factory=list)
|
||||
benchmarks: List[BenchmarkInput] = Field(default_factory=list)
|
||||
tool_groups: List[ToolGroupInput] = Field(default_factory=list)
|
||||
|
||||
server: ServerConfig = Field(
|
||||
|
|
|
@ -44,7 +44,7 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]:
|
|||
router_api=Api.scoring,
|
||||
),
|
||||
AutoRoutedApiInfo(
|
||||
routing_table_api=Api.eval_tasks,
|
||||
routing_table_api=Api.benchmarks,
|
||||
router_api=Api.eval,
|
||||
),
|
||||
AutoRoutedApiInfo(
|
||||
|
|
|
@ -9,10 +9,10 @@ import logging
|
|||
from typing import Any, Dict, List, Set
|
||||
|
||||
from llama_stack.apis.agents import Agents
|
||||
from llama_stack.apis.benchmarks import Benchmarks
|
||||
from llama_stack.apis.datasetio import DatasetIO
|
||||
from llama_stack.apis.datasets import Datasets
|
||||
from llama_stack.apis.eval import Eval
|
||||
from llama_stack.apis.eval_tasks import EvalTasks
|
||||
from llama_stack.apis.inference import Inference
|
||||
from llama_stack.apis.inspect import Inspect
|
||||
from llama_stack.apis.models import Models
|
||||
|
@ -37,8 +37,8 @@ from llama_stack.distribution.store import DistributionRegistry
|
|||
from llama_stack.distribution.utils.dynamic import instantiate_class_type
|
||||
from llama_stack.providers.datatypes import (
|
||||
Api,
|
||||
BenchmarksProtocolPrivate,
|
||||
DatasetsProtocolPrivate,
|
||||
EvalTasksProtocolPrivate,
|
||||
InlineProviderSpec,
|
||||
ModelsProtocolPrivate,
|
||||
ProviderSpec,
|
||||
|
@ -73,7 +73,7 @@ def api_protocol_map() -> Dict[Api, Any]:
|
|||
Api.scoring: Scoring,
|
||||
Api.scoring_functions: ScoringFunctions,
|
||||
Api.eval: Eval,
|
||||
Api.eval_tasks: EvalTasks,
|
||||
Api.benchmarks: Benchmarks,
|
||||
Api.post_training: PostTraining,
|
||||
Api.tool_groups: ToolGroups,
|
||||
Api.tool_runtime: ToolRuntime,
|
||||
|
@ -92,7 +92,7 @@ def additional_protocols_map() -> Dict[Api, Any]:
|
|||
ScoringFunctions,
|
||||
Api.scoring_functions,
|
||||
),
|
||||
Api.eval: (EvalTasksProtocolPrivate, EvalTasks, Api.eval_tasks),
|
||||
Api.eval: (BenchmarksProtocolPrivate, Benchmarks, Api.benchmarks),
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -11,8 +11,8 @@ from llama_stack.distribution.store import DistributionRegistry
|
|||
from llama_stack.providers.datatypes import Api, RoutingTable
|
||||
|
||||
from .routing_tables import (
|
||||
BenchmarksRoutingTable,
|
||||
DatasetsRoutingTable,
|
||||
EvalTasksRoutingTable,
|
||||
ModelsRoutingTable,
|
||||
ScoringFunctionsRoutingTable,
|
||||
ShieldsRoutingTable,
|
||||
|
@ -33,7 +33,7 @@ async def get_routing_table_impl(
|
|||
"shields": ShieldsRoutingTable,
|
||||
"datasets": DatasetsRoutingTable,
|
||||
"scoring_functions": ScoringFunctionsRoutingTable,
|
||||
"eval_tasks": EvalTasksRoutingTable,
|
||||
"benchmarks": BenchmarksRoutingTable,
|
||||
"tool_groups": ToolGroupsRoutingTable,
|
||||
}
|
||||
|
||||
|
|
|
@ -9,9 +9,8 @@ from typing import Any, AsyncGenerator, Dict, List, Optional
|
|||
from llama_stack.apis.common.content_types import URL, InterleavedContent
|
||||
from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult
|
||||
from llama_stack.apis.eval import (
|
||||
AppEvalTaskConfig,
|
||||
BenchmarkConfig,
|
||||
Eval,
|
||||
EvalTaskConfig,
|
||||
EvaluateResponse,
|
||||
Job,
|
||||
JobStatus,
|
||||
|
@ -347,23 +346,23 @@ class EvalRouter(Eval):
|
|||
|
||||
async def run_eval(
|
||||
self,
|
||||
task_id: str,
|
||||
task_config: AppEvalTaskConfig,
|
||||
benchmark_id: str,
|
||||
task_config: BenchmarkConfig,
|
||||
) -> Job:
|
||||
return await self.routing_table.get_provider_impl(task_id).run_eval(
|
||||
task_id=task_id,
|
||||
return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
|
||||
benchmark_id=benchmark_id,
|
||||
task_config=task_config,
|
||||
)
|
||||
|
||||
async def evaluate_rows(
|
||||
self,
|
||||
task_id: str,
|
||||
benchmark_id: str,
|
||||
input_rows: List[Dict[str, Any]],
|
||||
scoring_functions: List[str],
|
||||
task_config: EvalTaskConfig,
|
||||
task_config: BenchmarkConfig,
|
||||
) -> EvaluateResponse:
|
||||
return await self.routing_table.get_provider_impl(task_id).evaluate_rows(
|
||||
task_id=task_id,
|
||||
return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows(
|
||||
benchmark_id=benchmark_id,
|
||||
input_rows=input_rows,
|
||||
scoring_functions=scoring_functions,
|
||||
task_config=task_config,
|
||||
|
@ -371,30 +370,72 @@ class EvalRouter(Eval):
|
|||
|
||||
async def job_status(
|
||||
self,
|
||||
task_id: str,
|
||||
benchmark_id: str,
|
||||
job_id: str,
|
||||
) -> Optional[JobStatus]:
|
||||
return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id)
|
||||
return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id)
|
||||
|
||||
async def job_cancel(
|
||||
self,
|
||||
task_id: str,
|
||||
benchmark_id: str,
|
||||
job_id: str,
|
||||
) -> None:
|
||||
await self.routing_table.get_provider_impl(task_id).job_cancel(
|
||||
task_id,
|
||||
await self.routing_table.get_provider_impl(benchmark_id).job_cancel(
|
||||
benchmark_id,
|
||||
job_id,
|
||||
)
|
||||
|
||||
async def job_result(
|
||||
self,
|
||||
benchmark_id: str,
|
||||
job_id: str,
|
||||
) -> EvaluateResponse:
|
||||
return await self.routing_table.get_provider_impl(benchmark_id).job_result(
|
||||
benchmark_id,
|
||||
job_id,
|
||||
)
|
||||
|
||||
async def DEPRECATED_run_eval(
|
||||
self,
|
||||
task_id: str,
|
||||
task_config: BenchmarkConfig,
|
||||
) -> Job:
|
||||
return await self.run_eval(benchmark_id=task_id, task_config=task_config)
|
||||
|
||||
async def DEPRECATED_evaluate_rows(
|
||||
self,
|
||||
task_id: str,
|
||||
input_rows: List[Dict[str, Any]],
|
||||
scoring_functions: List[str],
|
||||
task_config: BenchmarkConfig,
|
||||
) -> EvaluateResponse:
|
||||
return await self.evaluate_rows(
|
||||
benchmark_id=task_id,
|
||||
input_rows=input_rows,
|
||||
scoring_functions=scoring_functions,
|
||||
task_config=task_config,
|
||||
)
|
||||
|
||||
async def DEPRECATED_job_status(
|
||||
self,
|
||||
task_id: str,
|
||||
job_id: str,
|
||||
) -> Optional[JobStatus]:
|
||||
return await self.job_status(benchmark_id=task_id, job_id=job_id)
|
||||
|
||||
async def DEPRECATED_job_cancel(
|
||||
self,
|
||||
task_id: str,
|
||||
job_id: str,
|
||||
) -> None:
|
||||
return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
|
||||
|
||||
async def DEPRECATED_job_result(
|
||||
self,
|
||||
task_id: str,
|
||||
job_id: str,
|
||||
) -> EvaluateResponse:
|
||||
return await self.routing_table.get_provider_impl(task_id).job_result(
|
||||
task_id,
|
||||
job_id,
|
||||
)
|
||||
return await self.job_result(benchmark_id=task_id, job_id=job_id)
|
||||
|
||||
|
||||
class ToolRuntimeRouter(ToolRuntime):
|
||||
|
|
|
@ -4,14 +4,15 @@
|
|||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from pydantic import TypeAdapter
|
||||
|
||||
from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse
|
||||
from llama_stack.apis.common.content_types import URL
|
||||
from llama_stack.apis.common.type_system import ParamType
|
||||
from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse
|
||||
from llama_stack.apis.eval_tasks import EvalTask, EvalTasks, ListEvalTasksResponse
|
||||
from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType
|
||||
from llama_stack.apis.resource import ResourceType
|
||||
from llama_stack.apis.scoring_functions import (
|
||||
|
@ -38,6 +39,8 @@ from llama_stack.distribution.datatypes import (
|
|||
from llama_stack.distribution.store import DistributionRegistry
|
||||
from llama_stack.providers.datatypes import Api, RoutingTable
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_impl_api(p: Any) -> Api:
|
||||
return p.__provider_spec__.api
|
||||
|
@ -60,7 +63,7 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable
|
|||
elif api == Api.scoring:
|
||||
return await p.register_scoring_function(obj)
|
||||
elif api == Api.eval:
|
||||
return await p.register_eval_task(obj)
|
||||
return await p.register_benchmark(obj)
|
||||
elif api == Api.tool_runtime:
|
||||
return await p.register_tool(obj)
|
||||
else:
|
||||
|
@ -121,7 +124,7 @@ class CommonRoutingTableImpl(RoutingTable):
|
|||
scoring_functions = await p.list_scoring_functions()
|
||||
await add_objects(scoring_functions, pid, ScoringFn)
|
||||
elif api == Api.eval:
|
||||
p.eval_task_store = self
|
||||
p.benchmark_store = self
|
||||
elif api == Api.tool_runtime:
|
||||
p.tool_store = self
|
||||
|
||||
|
@ -141,8 +144,8 @@ class CommonRoutingTableImpl(RoutingTable):
|
|||
return ("DatasetIO", "dataset")
|
||||
elif isinstance(self, ScoringFunctionsRoutingTable):
|
||||
return ("Scoring", "scoring_function")
|
||||
elif isinstance(self, EvalTasksRoutingTable):
|
||||
return ("Eval", "eval_task")
|
||||
elif isinstance(self, BenchmarksRoutingTable):
|
||||
return ("Eval", "benchmark")
|
||||
elif isinstance(self, ToolGroupsRoutingTable):
|
||||
return ("Tools", "tool")
|
||||
else:
|
||||
|
@ -428,20 +431,20 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
|
|||
await self.register_object(scoring_fn)
|
||||
|
||||
|
||||
class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks):
|
||||
async def list_eval_tasks(self) -> ListEvalTasksResponse:
|
||||
return ListEvalTasksResponse(data=await self.get_all_with_type("eval_task"))
|
||||
class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
|
||||
async def list_benchmarks(self) -> ListBenchmarksResponse:
|
||||
return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark"))
|
||||
|
||||
async def get_eval_task(self, eval_task_id: str) -> Optional[EvalTask]:
|
||||
return await self.get_object_by_identifier("eval_task", eval_task_id)
|
||||
async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]:
|
||||
return await self.get_object_by_identifier("benchmark", benchmark_id)
|
||||
|
||||
async def register_eval_task(
|
||||
async def register_benchmark(
|
||||
self,
|
||||
eval_task_id: str,
|
||||
benchmark_id: str,
|
||||
dataset_id: str,
|
||||
scoring_functions: List[str],
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
provider_eval_task_id: Optional[str] = None,
|
||||
provider_benchmark_id: Optional[str] = None,
|
||||
provider_id: Optional[str] = None,
|
||||
) -> None:
|
||||
if metadata is None:
|
||||
|
@ -453,17 +456,46 @@ class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks):
|
|||
raise ValueError(
|
||||
"No provider specified and multiple providers available. Please specify a provider_id."
|
||||
)
|
||||
if provider_eval_task_id is None:
|
||||
provider_eval_task_id = eval_task_id
|
||||
eval_task = EvalTask(
|
||||
identifier=eval_task_id,
|
||||
if provider_benchmark_id is None:
|
||||
provider_benchmark_id = benchmark_id
|
||||
benchmark = Benchmark(
|
||||
identifier=benchmark_id,
|
||||
dataset_id=dataset_id,
|
||||
scoring_functions=scoring_functions,
|
||||
metadata=metadata,
|
||||
provider_id=provider_id,
|
||||
provider_resource_id=provider_eval_task_id,
|
||||
provider_resource_id=provider_benchmark_id,
|
||||
)
|
||||
await self.register_object(benchmark)
|
||||
|
||||
async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse:
|
||||
logger.warning("DEPRECATED: Use /eval/benchmarks instead")
|
||||
return await self.list_benchmarks()
|
||||
|
||||
async def DEPRECATED_get_eval_task(
|
||||
self,
|
||||
eval_task_id: str,
|
||||
) -> Optional[Benchmark]:
|
||||
logger.warning("DEPRECATED: Use /eval/benchmarks instead")
|
||||
return await self.get_benchmark(eval_task_id)
|
||||
|
||||
async def DEPRECATED_register_eval_task(
|
||||
self,
|
||||
eval_task_id: str,
|
||||
dataset_id: str,
|
||||
scoring_functions: List[str],
|
||||
provider_benchmark_id: Optional[str] = None,
|
||||
provider_id: Optional[str] = None,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
) -> None:
|
||||
logger.warning("DEPRECATED: Use /eval/benchmarks instead")
|
||||
return await self.register_benchmark(
|
||||
benchmark_id=eval_task_id,
|
||||
dataset_id=dataset_id,
|
||||
scoring_functions=scoring_functions,
|
||||
metadata=metadata,
|
||||
provider_benchmark_id=provider_benchmark_id,
|
||||
)
|
||||
await self.register_object(eval_task)
|
||||
|
||||
|
||||
class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
|
||||
|
|
|
@ -15,10 +15,10 @@ from termcolor import colored
|
|||
|
||||
from llama_stack.apis.agents import Agents
|
||||
from llama_stack.apis.batch_inference import BatchInference
|
||||
from llama_stack.apis.benchmarks import Benchmarks
|
||||
from llama_stack.apis.datasetio import DatasetIO
|
||||
from llama_stack.apis.datasets import Datasets
|
||||
from llama_stack.apis.eval import Eval
|
||||
from llama_stack.apis.eval_tasks import EvalTasks
|
||||
from llama_stack.apis.inference import Inference
|
||||
from llama_stack.apis.inspect import Inspect
|
||||
from llama_stack.apis.models import Models
|
||||
|
@ -53,7 +53,7 @@ class LlamaStack(
|
|||
PostTraining,
|
||||
VectorIO,
|
||||
Eval,
|
||||
EvalTasks,
|
||||
Benchmarks,
|
||||
Scoring,
|
||||
ScoringFunctions,
|
||||
DatasetIO,
|
||||
|
@ -78,7 +78,7 @@ RESOURCES = [
|
|||
"register_scoring_function",
|
||||
"list_scoring_functions",
|
||||
),
|
||||
("eval_tasks", Api.eval_tasks, "register_eval_task", "list_eval_tasks"),
|
||||
("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks"),
|
||||
("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups"),
|
||||
]
|
||||
|
||||
|
|
|
@ -26,7 +26,7 @@ $ llama-stack-client datasets register \
|
|||
```
|
||||
|
||||
```bash
|
||||
$ llama-stack-client eval_tasks register \
|
||||
$ llama-stack-client benchmarks register \
|
||||
--eval-task-id meta-reference-mmlu \
|
||||
--provider-id meta-reference \
|
||||
--dataset-id mmlu \
|
||||
|
|
|
@ -8,12 +8,12 @@ import streamlit as st
|
|||
from modules.api import llama_stack_api
|
||||
|
||||
|
||||
def eval_tasks():
|
||||
# Eval Tasks Section
|
||||
st.header("Eval Tasks")
|
||||
def benchmarks():
|
||||
# Benchmarks Section
|
||||
st.header("Benchmarks")
|
||||
|
||||
eval_tasks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.eval_tasks.list()}
|
||||
benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()}
|
||||
|
||||
if len(eval_tasks_info) > 0:
|
||||
selected_eval_task = st.selectbox("Select an eval task", list(eval_tasks_info.keys()), key="eval_task_inspect")
|
||||
st.json(eval_tasks_info[selected_eval_task], expanded=True)
|
||||
if len(benchmarks_info) > 0:
|
||||
selected_benchmark = st.selectbox("Select an eval task", list(benchmarks_info.keys()), key="benchmark_inspect")
|
||||
st.json(benchmarks_info[selected_benchmark], expanded=True)
|
||||
|
|
|
@ -4,8 +4,8 @@
|
|||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from page.distribution.benchmarks import benchmarks
|
||||
from page.distribution.datasets import datasets
|
||||
from page.distribution.eval_tasks import eval_tasks
|
||||
from page.distribution.models import models
|
||||
from page.distribution.scoring_functions import scoring_functions
|
||||
from page.distribution.shields import shields
|
||||
|
@ -20,7 +20,7 @@ def resources_page():
|
|||
"Shields",
|
||||
"Scoring Functions",
|
||||
"Datasets",
|
||||
"Eval Tasks",
|
||||
"Benchmarks",
|
||||
]
|
||||
icons = ["magic", "memory", "shield", "file-bar-graph", "database", "list-task"]
|
||||
selected_resource = option_menu(
|
||||
|
@ -34,8 +34,8 @@ def resources_page():
|
|||
},
|
||||
},
|
||||
)
|
||||
if selected_resource == "Eval Tasks":
|
||||
eval_tasks()
|
||||
if selected_resource == "Benchmarks":
|
||||
benchmarks()
|
||||
elif selected_resource == "Vector Databases":
|
||||
vector_dbs()
|
||||
elif selected_resource == "Datasets":
|
||||
|
|
|
@ -11,28 +11,28 @@ import streamlit as st
|
|||
from modules.api import llama_stack_api
|
||||
|
||||
|
||||
def select_eval_task_1():
|
||||
# Select Eval Tasks
|
||||
def select_benchmark_1():
|
||||
# Select Benchmarks
|
||||
st.subheader("1. Choose An Eval Task")
|
||||
eval_tasks = llama_stack_api.client.eval_tasks.list()
|
||||
eval_tasks = {et.identifier: et for et in eval_tasks}
|
||||
eval_tasks_names = list(eval_tasks.keys())
|
||||
selected_eval_task = st.selectbox(
|
||||
benchmarks = llama_stack_api.client.benchmarks.list()
|
||||
benchmarks = {et.identifier: et for et in benchmarks}
|
||||
benchmarks_names = list(benchmarks.keys())
|
||||
selected_benchmark = st.selectbox(
|
||||
"Choose an eval task.",
|
||||
options=eval_tasks_names,
|
||||
options=benchmarks_names,
|
||||
help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
|
||||
)
|
||||
with st.expander("View Eval Task"):
|
||||
st.json(eval_tasks[selected_eval_task], expanded=True)
|
||||
st.json(benchmarks[selected_benchmark], expanded=True)
|
||||
|
||||
st.session_state["selected_eval_task"] = selected_eval_task
|
||||
st.session_state["eval_tasks"] = eval_tasks
|
||||
st.session_state["selected_benchmark"] = selected_benchmark
|
||||
st.session_state["benchmarks"] = benchmarks
|
||||
if st.button("Confirm", key="confirm_1"):
|
||||
st.session_state["selected_eval_task_1_next"] = True
|
||||
st.session_state["selected_benchmark_1_next"] = True
|
||||
|
||||
|
||||
def define_eval_candidate_2():
|
||||
if not st.session_state.get("selected_eval_task_1_next", None):
|
||||
if not st.session_state.get("selected_benchmark_1_next", None):
|
||||
return
|
||||
|
||||
st.subheader("2. Define Eval Candidate")
|
||||
|
@ -161,11 +161,11 @@ def run_evaluation_3():
|
|||
Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
|
||||
"""
|
||||
)
|
||||
selected_eval_task = st.session_state["selected_eval_task"]
|
||||
eval_tasks = st.session_state["eval_tasks"]
|
||||
selected_benchmark = st.session_state["selected_benchmark"]
|
||||
benchmarks = st.session_state["benchmarks"]
|
||||
eval_candidate = st.session_state["eval_candidate"]
|
||||
|
||||
dataset_id = eval_tasks[selected_eval_task].dataset_id
|
||||
dataset_id = benchmarks[selected_benchmark].dataset_id
|
||||
rows = llama_stack_api.client.datasetio.get_rows_paginated(
|
||||
dataset_id=dataset_id,
|
||||
rows_in_page=-1,
|
||||
|
@ -180,16 +180,16 @@ def run_evaluation_3():
|
|||
help="Number of examples from the dataset to evaluate. ",
|
||||
)
|
||||
|
||||
eval_task_config = {
|
||||
benchmark_config = {
|
||||
"type": "benchmark",
|
||||
"eval_candidate": eval_candidate,
|
||||
"scoring_params": {},
|
||||
}
|
||||
|
||||
with st.expander("View Evaluation Task", expanded=True):
|
||||
st.json(eval_tasks[selected_eval_task], expanded=True)
|
||||
st.json(benchmarks[selected_benchmark], expanded=True)
|
||||
with st.expander("View Evaluation Task Configuration", expanded=True):
|
||||
st.json(eval_task_config, expanded=True)
|
||||
st.json(benchmark_config, expanded=True)
|
||||
|
||||
# Add run button and handle evaluation
|
||||
if st.button("Run Evaluation"):
|
||||
|
@ -209,10 +209,10 @@ def run_evaluation_3():
|
|||
progress_bar.progress(progress, text=progress_text)
|
||||
# Run evaluation for current row
|
||||
eval_res = llama_stack_api.client.eval.evaluate_rows(
|
||||
task_id=selected_eval_task,
|
||||
benchmark_id=selected_benchmark,
|
||||
input_rows=[r],
|
||||
scoring_functions=eval_tasks[selected_eval_task].scoring_functions,
|
||||
task_config=eval_task_config,
|
||||
scoring_functions=benchmarks[selected_benchmark].scoring_functions,
|
||||
task_config=benchmark_config,
|
||||
)
|
||||
|
||||
for k in r.keys():
|
||||
|
@ -225,7 +225,7 @@ def run_evaluation_3():
|
|||
output_res[k] = []
|
||||
output_res[k].append(eval_res.generations[0][k])
|
||||
|
||||
for scoring_fn in eval_tasks[selected_eval_task].scoring_functions:
|
||||
for scoring_fn in benchmarks[selected_benchmark].scoring_functions:
|
||||
if scoring_fn not in output_res:
|
||||
output_res[scoring_fn] = []
|
||||
output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])
|
||||
|
@ -245,7 +245,7 @@ def native_evaluation_page():
|
|||
st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
|
||||
st.title("📊 Evaluations (Generation + Scoring)")
|
||||
|
||||
select_eval_task_1()
|
||||
select_benchmark_1()
|
||||
define_eval_candidate_2()
|
||||
run_evaluation_3()
|
||||
|
||||
|
|
|
@ -10,9 +10,9 @@ from urllib.parse import urlparse
|
|||
from llama_models.schema_utils import json_schema_type
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from llama_stack.apis.benchmarks import Benchmark
|
||||
from llama_stack.apis.datasets import Dataset
|
||||
from llama_stack.apis.datatypes import Api
|
||||
from llama_stack.apis.eval_tasks import EvalTask
|
||||
from llama_stack.apis.models import Model
|
||||
from llama_stack.apis.scoring_functions import ScoringFn
|
||||
from llama_stack.apis.shields import Shield
|
||||
|
@ -48,8 +48,8 @@ class ScoringFunctionsProtocolPrivate(Protocol):
|
|||
async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ...
|
||||
|
||||
|
||||
class EvalTasksProtocolPrivate(Protocol):
|
||||
async def register_eval_task(self, eval_task: EvalTask) -> None: ...
|
||||
class BenchmarksProtocolPrivate(Protocol):
|
||||
async def register_benchmark(self, benchmark: Benchmark) -> None: ...
|
||||
|
||||
|
||||
class ToolsProtocolPrivate(Protocol):
|
||||
|
|
|
@@ -8,13 +8,13 @@ from typing import Any, Dict, List, Optional
 from tqdm import tqdm

 from llama_stack.apis.agents import Agents, StepType
+from llama_stack.apis.benchmarks import Benchmark
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.eval_tasks import EvalTask
 from llama_stack.apis.inference import Inference, UserMessage
 from llama_stack.apis.scoring import Scoring
 from llama_stack.distribution.datatypes import Api
-from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
+from llama_stack.providers.datatypes import BenchmarksProtocolPrivate
 from llama_stack.providers.inline.agents.meta_reference.agent_instance import (
     MEMORY_QUERY_TOOL,
 )
@@ -26,15 +26,15 @@ from llama_stack.providers.utils.common.data_schema_validator import (
 from llama_stack.providers.utils.kvstore import kvstore_impl

 from .....apis.common.job_types import Job
-from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
+from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse, JobStatus
 from .config import MetaReferenceEvalConfig

-EVAL_TASKS_PREFIX = "eval_tasks:"
+EVAL_TASKS_PREFIX = "benchmarks:"


 class MetaReferenceEvalImpl(
     Eval,
-    EvalTasksProtocolPrivate,
+    BenchmarksProtocolPrivate,
 ):
     def __init__(
         self,
@@ -55,36 +55,36 @@ class MetaReferenceEvalImpl(
         # TODO: assume sync job, will need jobs API for async scheduling
         self.jobs = {}

-        self.eval_tasks = {}
+        self.benchmarks = {}

     async def initialize(self) -> None:
         self.kvstore = await kvstore_impl(self.config.kvstore)
-        # Load existing eval_tasks from kvstore
+        # Load existing benchmarks from kvstore
         start_key = EVAL_TASKS_PREFIX
         end_key = f"{EVAL_TASKS_PREFIX}\xff"
-        stored_eval_tasks = await self.kvstore.range(start_key, end_key)
+        stored_benchmarks = await self.kvstore.range(start_key, end_key)

-        for eval_task in stored_eval_tasks:
-            eval_task = EvalTask.model_validate_json(eval_task)
-            self.eval_tasks[eval_task.identifier] = eval_task
+        for benchmark in stored_benchmarks:
+            benchmark = Benchmark.model_validate_json(benchmark)
+            self.benchmarks[benchmark.identifier] = benchmark

     async def shutdown(self) -> None: ...

-    async def register_eval_task(self, task_def: EvalTask) -> None:
+    async def register_benchmark(self, task_def: Benchmark) -> None:
         # Store in kvstore
         key = f"{EVAL_TASKS_PREFIX}{task_def.identifier}"
         await self.kvstore.set(
             key=key,
             value=task_def.model_dump_json(),
         )
-        self.eval_tasks[task_def.identifier] = task_def
+        self.benchmarks[task_def.identifier] = task_def

     async def run_eval(
         self,
-        task_id: str,
-        task_config: EvalTaskConfig,
+        benchmark_id: str,
+        task_config: BenchmarkConfig,
     ) -> Job:
-        task_def = self.eval_tasks[task_id]
+        task_def = self.benchmarks[benchmark_id]
         dataset_id = task_def.dataset_id
         candidate = task_config.eval_candidate
         scoring_functions = task_def.scoring_functions
@@ -95,7 +95,7 @@ class MetaReferenceEvalImpl(
             rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples),
         )
         res = await self.evaluate_rows(
-            task_id=task_id,
+            benchmark_id=benchmark_id,
             input_rows=all_rows.rows,
             scoring_functions=scoring_functions,
             task_config=task_config,
@@ -108,7 +108,7 @@ class MetaReferenceEvalImpl(
         return Job(job_id=job_id)

     async def _run_agent_generation(
-        self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
+        self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
     ) -> List[Dict[str, Any]]:
         candidate = task_config.eval_candidate
         create_response = await self.agents_api.create_agent(candidate.config)
@@ -151,7 +151,7 @@ class MetaReferenceEvalImpl(
         return generations

     async def _run_model_generation(
-        self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
+        self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
     ) -> List[Dict[str, Any]]:
         candidate = task_config.eval_candidate
         assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"
@@ -187,10 +187,10 @@ class MetaReferenceEvalImpl(

     async def evaluate_rows(
         self,
-        task_id: str,
+        benchmark_id: str,
         input_rows: List[Dict[str, Any]],
         scoring_functions: List[str],
-        task_config: EvalTaskConfig,
+        task_config: BenchmarkConfig,
     ) -> EvaluateResponse:
         candidate = task_config.eval_candidate
         if candidate.type == "agent":
@@ -203,7 +203,7 @@ class MetaReferenceEvalImpl(
         # scoring with generated_answer
         score_input_rows = [input_r | generated_r for input_r, generated_r in zip(input_rows, generations)]

-        if task_config.type == "app" and task_config.scoring_params is not None:
+        if task_config.scoring_params is not None:
             scoring_functions_dict = {
                 scoring_fn_id: task_config.scoring_params.get(scoring_fn_id, None)
                 for scoring_fn_id in scoring_functions
@@ -217,18 +217,60 @@ class MetaReferenceEvalImpl(

         return EvaluateResponse(generations=generations, scores=score_response.results)

-    async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]:
+    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
         if job_id in self.jobs:
             return JobStatus.completed

         return None

-    async def job_cancel(self, task_id: str, job_id: str) -> None:
+    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
         raise NotImplementedError("Job cancel is not implemented yet")

-    async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse:
-        status = await self.job_status(task_id, job_id)
+    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
+        status = await self.job_status(benchmark_id, job_id)
         if not status or status != JobStatus.completed:
             raise ValueError(f"Job is not completed, Status: {status.value}")

         return self.jobs[job_id]
+
+    async def DEPRECATED_run_eval(
+        self,
+        task_id: str,
+        task_config: BenchmarkConfig,
+    ) -> Job:
+        return await self.run_eval(benchmark_id=task_id, task_config=task_config)
+
+    async def DEPRECATED_evaluate_rows(
+        self,
+        task_id: str,
+        input_rows: List[Dict[str, Any]],
+        scoring_functions: List[str],
+        task_config: BenchmarkConfig,
+    ) -> EvaluateResponse:
+        return await self.evaluate_rows(
+            benchmark_id=task_id,
+            input_rows=input_rows,
+            scoring_functions=scoring_functions,
+            task_config=task_config,
+        )
+
+    async def DEPRECATED_job_status(
+        self,
+        task_id: str,
+        job_id: str,
+    ) -> Optional[JobStatus]:
+        return await self.job_status(benchmark_id=task_id, job_id=job_id)
+
+    async def DEPRECATED_job_cancel(
+        self,
+        task_id: str,
+        job_id: str,
+    ) -> None:
+        return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
+
+    async def DEPRECATED_job_result(
+        self,
+        task_id: str,
+        job_id: str,
+    ) -> EvaluateResponse:
+        return await self.job_result(benchmark_id=task_id, job_id=job_id)

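The implementation keeps the `run_eval` → `job_status` → `job_result` flow intact and only swaps `task_id` for `benchmark_id` (the `DEPRECATED_*` wrappers delegate the old names to the new ones). For illustration, a hedged sketch of driving that flow directly against the provider; `eval_impl` is assumed to be an initialized `MetaReferenceEvalImpl` and `task_config` a valid `BenchmarkConfig`, both constructed elsewhere.

```python
# Hypothetical driver for the renamed async job flow shown above.
from llama_stack.apis.eval.eval import JobStatus


async def run_and_collect(eval_impl, benchmark_id: str, task_config):
    # Kick off the eval; the meta-reference provider runs it synchronously.
    job = await eval_impl.run_eval(benchmark_id=benchmark_id, task_config=task_config)

    status = await eval_impl.job_status(benchmark_id, job.job_id)
    if status != JobStatus.completed:
        raise RuntimeError(f"unexpected job status: {status}")

    # Returns an EvaluateResponse with generations and per-function scores.
    return await eval_impl.job_result(benchmark_id, job.job_id)
```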
@@ -10,8 +10,8 @@ import pytest
 from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType
 from llama_stack.apis.eval.eval import (
-    AppEvalTaskConfig,
-    BenchmarkEvalTaskConfig,
+    AppBenchmarkConfig,
+    BenchmarkBenchmarkConfig,
     ModelCandidate,
 )
 from llama_stack.apis.inference import SamplingParams
@@ -30,18 +30,18 @@ from .constants import JUDGE_PROMPT

 class Testeval:
     @pytest.mark.asyncio
-    async def test_eval_tasks_list(self, eval_stack):
+    async def test_benchmarks_list(self, eval_stack):
         # NOTE: this needs you to ensure that you are starting from a clean state
         # but so far we don't have an unregister API unfortunately, so be careful
-        eval_tasks_impl = eval_stack[Api.eval_tasks]
-        response = await eval_tasks_impl.list_eval_tasks()
+        benchmarks_impl = eval_stack[Api.benchmarks]
+        response = await benchmarks_impl.list_benchmarks()
         assert isinstance(response, list)

     @pytest.mark.asyncio
     async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model):
-        eval_impl, eval_tasks_impl, datasetio_impl, datasets_impl, models_impl = (
+        eval_impl, benchmarks_impl, datasetio_impl, datasets_impl, models_impl = (
             eval_stack[Api.eval],
-            eval_stack[Api.eval_tasks],
+            eval_stack[Api.benchmarks],
             eval_stack[Api.datasetio],
             eval_stack[Api.datasets],
             eval_stack[Api.models],
@@ -59,17 +59,17 @@ class Testeval:
         scoring_functions = [
             "basic::equality",
         ]
-        task_id = "meta-reference::app_eval"
-        await eval_tasks_impl.register_eval_task(
-            eval_task_id=task_id,
+        benchmark_id = "meta-reference::app_eval"
+        await benchmarks_impl.register_benchmark(
+            benchmark_id=benchmark_id,
             dataset_id="test_dataset_for_eval",
             scoring_functions=scoring_functions,
         )
         response = await eval_impl.evaluate_rows(
-            task_id=task_id,
+            benchmark_id=benchmark_id,
             input_rows=rows.rows,
             scoring_functions=scoring_functions,
-            task_config=AppEvalTaskConfig(
+            task_config=AppBenchmarkConfig(
                 eval_candidate=ModelCandidate(
                     model=inference_model,
                     sampling_params=SamplingParams(),
@@ -92,9 +92,9 @@ class Testeval:

     @pytest.mark.asyncio
     async def test_eval_run_eval(self, eval_stack, inference_model, judge_model):
-        eval_impl, eval_tasks_impl, datasets_impl, models_impl = (
+        eval_impl, benchmarks_impl, datasets_impl, models_impl = (
             eval_stack[Api.eval],
-            eval_stack[Api.eval_tasks],
+            eval_stack[Api.benchmarks],
             eval_stack[Api.datasets],
             eval_stack[Api.models],
         )
@@ -105,15 +105,15 @@ class Testeval:
             "basic::subset_of",
         ]

-        task_id = "meta-reference::app_eval-2"
-        await eval_tasks_impl.register_eval_task(
-            eval_task_id=task_id,
+        benchmark_id = "meta-reference::app_eval-2"
+        await benchmarks_impl.register_benchmark(
+            benchmark_id=benchmark_id,
             dataset_id="test_dataset_for_eval",
             scoring_functions=scoring_functions,
         )
         response = await eval_impl.run_eval(
-            task_id=task_id,
-            task_config=AppEvalTaskConfig(
+            benchmark_id=benchmark_id,
+            task_config=AppBenchmarkConfig(
                 eval_candidate=ModelCandidate(
                     model=inference_model,
                     sampling_params=SamplingParams(),
@@ -121,9 +121,9 @@ class Testeval:
             ),
         )
         assert response.job_id == "0"
-        job_status = await eval_impl.job_status(task_id, response.job_id)
+        job_status = await eval_impl.job_status(benchmark_id, response.job_id)
         assert job_status and job_status.value == "completed"
-        eval_response = await eval_impl.job_result(task_id, response.job_id)
+        eval_response = await eval_impl.job_result(benchmark_id, response.job_id)

         assert eval_response is not None
         assert len(eval_response.generations) == 5
@@ -131,9 +131,9 @@ class Testeval:

     @pytest.mark.asyncio
     async def test_eval_run_benchmark_eval(self, eval_stack, inference_model):
-        eval_impl, eval_tasks_impl, datasets_impl, models_impl = (
+        eval_impl, benchmarks_impl, datasets_impl, models_impl = (
             eval_stack[Api.eval],
-            eval_stack[Api.eval_tasks],
+            eval_stack[Api.benchmarks],
             eval_stack[Api.datasets],
             eval_stack[Api.models],
         )
@@ -159,20 +159,20 @@ class Testeval:
         )

         # register eval task
-        await eval_tasks_impl.register_eval_task(
-            eval_task_id="meta-reference-mmlu",
+        await benchmarks_impl.register_benchmark(
+            benchmark_id="meta-reference-mmlu",
             dataset_id="mmlu",
             scoring_functions=["basic::regex_parser_multiple_choice_answer"],
         )

         # list benchmarks
-        response = await eval_tasks_impl.list_eval_tasks()
+        response = await benchmarks_impl.list_benchmarks()
         assert len(response) > 0

         benchmark_id = "meta-reference-mmlu"
         response = await eval_impl.run_eval(
-            task_id=benchmark_id,
-            task_config=BenchmarkEvalTaskConfig(
+            benchmark_id=benchmark_id,
+            task_config=BenchmarkBenchmarkConfig(
                 eval_candidate=ModelCandidate(
                     model=inference_model,
                     sampling_params=SamplingParams(),

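The config objects these tests build can also be constructed on their own; a minimal sketch using the renamed names from the imports at the top of this test file (the model name is a placeholder, not something this PR specifies).

```python
# Standalone construction of the eval candidate used in the tests above.
from llama_stack.apis.eval.eval import AppBenchmarkConfig, ModelCandidate
from llama_stack.apis.inference import SamplingParams

task_config = AppBenchmarkConfig(
    eval_candidate=ModelCandidate(
        model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
        sampling_params=SamplingParams(),
    ),
)
```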
@@ -10,8 +10,8 @@ from typing import Any, Dict, List, Optional

 from pydantic import BaseModel

+from llama_stack.apis.benchmarks import BenchmarkInput
 from llama_stack.apis.datasets import DatasetInput
-from llama_stack.apis.eval_tasks import EvalTaskInput
 from llama_stack.apis.models import ModelInput
 from llama_stack.apis.scoring_functions import ScoringFnInput
 from llama_stack.apis.shields import ShieldInput
@@ -42,7 +42,7 @@ async def construct_stack_for_test(
     vector_dbs: Optional[List[VectorDBInput]] = None,
     datasets: Optional[List[DatasetInput]] = None,
     scoring_fns: Optional[List[ScoringFnInput]] = None,
-    eval_tasks: Optional[List[EvalTaskInput]] = None,
+    benchmarks: Optional[List[BenchmarkInput]] = None,
     tool_groups: Optional[List[ToolGroupInput]] = None,
 ) -> TestStack:
     sqlite_file = tempfile.NamedTemporaryFile(delete=False, suffix=".db")
@@ -56,7 +56,7 @@ async def construct_stack_for_test(
         vector_dbs=vector_dbs or [],
         datasets=datasets or [],
         scoring_fns=scoring_fns or [],
-        eval_tasks=eval_tasks or [],
+        benchmarks=benchmarks or [],
         tool_groups=tool_groups or [],
     )
     run_config = parse_and_maybe_upgrade_config(run_config)

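With the renamed parameter, test fixtures can seed benchmarks the same way they seed datasets. A hedged sketch of the kind of value the new `benchmarks` argument would receive; the exact fields of `BenchmarkInput` are an assumption modeled on `register_benchmark()` above, and the ids are placeholders.

```python
# Hypothetical seed data for the new `benchmarks` parameter of construct_stack_for_test.
from llama_stack.apis.benchmarks import BenchmarkInput

benchmarks = [
    BenchmarkInput(
        benchmark_id="meta-reference::app_eval",  # placeholder id
        dataset_id="test_dataset_for_eval",
        scoring_functions=["basic::equality"],
    )
]
# ...then passed through as construct_stack_for_test(..., benchmarks=benchmarks, ...).
```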
@@ -107,7 +107,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -109,7 +109,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -108,7 +108,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: brave-search

@@ -99,7 +99,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: brave-search

@@ -85,4 +85,4 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []

@@ -164,7 +164,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -153,7 +153,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -116,7 +116,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -106,7 +106,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -116,7 +116,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -106,7 +106,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -118,7 +118,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -107,7 +107,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -109,7 +109,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -139,7 +139,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -113,7 +113,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -110,7 +110,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -118,7 +118,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -107,7 +107,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -118,7 +118,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -106,7 +106,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -105,7 +105,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -159,7 +159,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -148,7 +148,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -109,7 +109,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search