fix!: update eval-tasks -> benchmarks (#1032)

# What does this PR do?

- Update `/eval-tasks` to `/benchmarks`
- ⚠️ Remove the distinction between `app` vs. `benchmark` eval task
config. Now we only have `BenchmarkConfig`. The overloaded `benchmark`
type is confusing and does not add any value. Backward compatibility is
kept, as the "type" field is not used anywhere (a sketch of the renamed
client calls follows below).
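
A minimal sketch of what the renamed surface looks like from the Python client. Identifiers, dataset, endpoint, and the candidate dict are illustrative (the candidate mirrors the `ModelCandidate` fields used elsewhere in this change); this is not an exhaustive migration guide.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local stack endpoint

# eval_tasks.register(eval_task_id=...)  ->  benchmarks.register(benchmark_id=...)
client.benchmarks.register(
    benchmark_id="meta-reference-mmlu",
    dataset_id="mmlu",
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)

# eval.run_eval(task_id=...)  ->  eval.run_eval(benchmark_id=...)
job = client.eval.run_eval(
    benchmark_id="meta-reference-mmlu",
    task_config={
        # "type" is kept only for backward compatibility; app vs. benchmark configs no longer differ
        "type": "benchmark",
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.1-8B-Instruct",  # illustrative model id
            "sampling_params": {"max_tokens": 512},
        },
    },
)
result = client.eval.job_result(benchmark_id="meta-reference-mmlu", job_id=job.job_id)
```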

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan
- This change is backward compatible.
- Run the notebook tests with:

```
pytest -v -s --nbval-lax ./docs/getting_started.ipynb
pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
```

<img width="846" alt="image"
src="https://github.com/user-attachments/assets/d2fc06a7-593a-444f-bc1f-10ab9b0c843d"
/>



[//]: # (## Documentation)
[//]: # (- [ ] Added a Changelog entry if the change is significant)

---------

Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
Signed-off-by: Ben Browning <bbrownin@redhat.com>
Signed-off-by: Sébastien Han <seb@redhat.com>
Signed-off-by: reidliu <reid201711@gmail.com>
Co-authored-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
Co-authored-by: Ben Browning <ben324@gmail.com>
Co-authored-by: Sébastien Han <seb@redhat.com>
Co-authored-by: Reid <61492567+reidliu41@users.noreply.github.com>
Co-authored-by: reidliu <reid201711@gmail.com>
Co-authored-by: Yuan Tang <terrytangyuan@gmail.com>
60 changed files with 2622 additions and 1910 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -324,7 +324,7 @@
"- vector_io\n",
"container_image: null\n",
"datasets: <span style=\"font-weight: bold\">[]</span>\n",
"eval_tasks: <span style=\"font-weight: bold\">[]</span>\n",
"benchmarks: <span style=\"font-weight: bold\">[]</span>\n",
"image_name: together\n",
"metadata_store:\n",
" db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/ashwin/.llama/distributions/together/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">registry.db</span>\n",
@ -508,7 +508,7 @@
"- vector_io\n",
"container_image: null\n",
"datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"image_name: together\n",
"metadata_store:\n",
" db_path: \u001b[35m/Users/ashwin/.llama/distributions/together/\u001b[0m\u001b[95mregistry.db\u001b[0m\n",


@ -370,7 +370,7 @@
"- tool_runtime\n",
"datasets: <span style=\"font-weight: bold\">[]</span>\n",
"container_image: null\n",
"eval_tasks: <span style=\"font-weight: bold\">[]</span>\n",
"benchmarks: <span style=\"font-weight: bold\">[]</span>\n",
"image_name: together\n",
"memory_banks: <span style=\"font-weight: bold\">[]</span>\n",
"metadata_store:\n",
@ -551,7 +551,7 @@
"- tool_runtime\n",
"datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"container_image: null\n",
"eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"image_name: together\n",
"memory_banks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"metadata_store:\n",


@ -647,6 +647,7 @@ class Generator:
description = "\n".join(
filter(None, [doc_string.short_description, doc_string.long_description])
)
return Operation(
tags=[op.defining_class.__name__],
summary=None,
@ -656,6 +657,7 @@ class Generator:
requestBody=requestBody,
responses=responses,
callbacks=callbacks,
deprecated=True if "DEPRECATED" in op.func_name else None,
security=[] if op.public else None,
)


@ -117,6 +117,7 @@ class Operation:
requestBody: Optional[RequestBody] = None
callbacks: Optional[Dict[str, "Callback"]] = None
security: Optional[List["SecurityRequirement"]] = None
deprecated: Optional[bool] = None
@dataclass


@ -41,14 +41,14 @@ system_message = {
"content": SYSTEM_PROMPT_TEMPLATE,
}
client.eval_tasks.register(
eval_task_id="meta-reference::mmmu",
client.benchmarks.register(
benchmark_id="meta-reference::mmmu",
dataset_id=f"mmmu-{subset}-{split}",
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
response = client.eval.evaluate_rows(
task_id="meta-reference::mmmu",
benchmark_id="meta-reference::mmmu",
input_rows=eval_rows,
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
task_config={
@ -99,14 +99,14 @@ eval_rows = client.datasetio.get_rows_paginated(
```
```python
client.eval_tasks.register(
eval_task_id="meta-reference::simpleqa",
client.benchmarks.register(
benchmark_id="meta-reference::simpleqa",
dataset_id=simpleqa_dataset_id,
scoring_functions=["llm-as-judge::405b-simpleqa"],
)
response = client.eval.evaluate_rows(
task_id="meta-reference::simpleqa",
benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
@ -156,7 +156,7 @@ agent_config = {
}
response = client.eval.evaluate_rows(
task_id="meta-reference::simpleqa",
benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={


@ -10,15 +10,15 @@ Here's how to set up basic evaluation:
```python
# Create an evaluation task
response = client.eval_tasks.register(
eval_task_id="my_eval",
response = client.benchmarks.register(
benchmark_id="my_eval",
dataset_id="my_dataset",
scoring_functions=["accuracy", "relevance"],
)
# Run evaluation
job = client.eval.run_eval(
task_id="my_eval",
benchmark_id="my_eval",
task_config={
"type": "app",
"eval_candidate": {"type": "agent", "config": agent_config},
@ -26,5 +26,5 @@ job = client.eval.run_eval(
)
# Get results
result = client.eval.job_result(task_id="my_eval", job_id=job.job_id)
result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
```


@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl
We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications.
- `/datasetio` + `/datasets` API
- `/scoring` + `/scoring_functions` API
- `/eval` + `/eval_tasks` API
- `/eval` + `/benchmarks` API
This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
- **Scoring**: evaluate outputs of the system.
- Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics.
- **Eval**: generate outputs (via Inference or Agents) and perform scoring.
- Associated with `EvalTask` resource.
- Associated with `Benchmark` resource.
Use the following decision tree to decide how to use LlamaStack Evaluation flow.


@ -42,7 +42,7 @@ Some of these APIs are associated with a set of **Resources**. Here is the mappi
- **Tool Runtime** is associated with `ToolGroup` resources.
- **DatasetIO** is associated with `Dataset` resources.
- **Scoring** is associated with `ScoringFunction` resources.
- **Eval** is associated with `Model` and `EvalTask` resources.
- **Eval** is associated with `Model` and `Benchmark` resources.
Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack.


@ -64,7 +64,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
```
```bash
$ llama-stack-client eval_tasks register \
$ llama-stack-client benchmarks register \
--eval-task-id meta-reference-mmlu \
--provider-id meta-reference \
--dataset-id mmlu \
@ -86,7 +86,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
- Under the hood, it uses Llama Stack's `/providers` API to get information about the providers.
- **API Resources**: Inspect Llama Stack API resources
- This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `eval_tasks`, `shields`).
- This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `benchmarks`, `shields`).
- Under the hood, it uses Llama Stack's `/<resources>/list` API to get information about each resources.
- Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources.


@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl
We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications.
- `/datasetio` + `/datasets` API
- `/scoring` + `/scoring_functions` API
- `/eval` + `/eval_tasks` API
- `/eval` + `/benchmarks` API
This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
- **Scoring**: evaluate outputs of the system.
- Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics.
- **Eval**: generate outputs (via Inference or Agents) and perform scoring.
- Associated with `EvalTask` resource.
- Associated with `Benchmark` resource.
Use the following decision tree to decide how to use LlamaStack Evaluation flow.
@ -77,14 +77,14 @@ system_message = {
"content": SYSTEM_PROMPT_TEMPLATE,
}
client.eval_tasks.register(
eval_task_id="meta-reference::mmmu",
client.benchmarks.register(
benchmark_id="meta-reference::mmmu",
dataset_id=f"mmmu-{subset}-{split}",
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
response = client.eval.evaluate_rows(
task_id="meta-reference::mmmu",
benchmark_id="meta-reference::mmmu",
input_rows=eval_rows,
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
task_config={
@ -135,14 +135,14 @@ eval_rows = client.datasetio.get_rows_paginated(
```
```python
client.eval_tasks.register(
eval_task_id="meta-reference::simpleqa",
client.benchmarks.register(
benchmark_id="meta-reference::simpleqa",
dataset_id=simpleqa_dataset_id,
scoring_functions=["llm-as-judge::405b-simpleqa"],
)
response = client.eval.evaluate_rows(
task_id="meta-reference::simpleqa",
benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
@ -192,7 +192,7 @@ agent_config = {
}
response = client.eval.evaluate_rows(
task_id="meta-reference::simpleqa",
benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
@ -281,7 +281,7 @@ The following examples give the quick steps to start running evaluations using t
#### Benchmark Evaluation CLI
Usage: There are 2 inputs necessary for running a benchmark eval
- `eval-task-id`: the identifier associated with the eval task. Each `EvalTask` is parametrized by
- `eval-task-id`: the identifier associated with the eval task. Each `Benchmark` is parametrized by
- `dataset_id`: the identifier associated with the dataset.
- `List[scoring_function_id]`: list of scoring function identifiers.
- `eval-task-config`: specifies the configuration of the model / agent to evaluate on.
@ -289,7 +289,7 @@ Usage: There are 2 inputs necessary for running a benchmark eval
```
llama-stack-client eval run_benchmark <eval-task-id> \
--eval-task-config ~/eval_task_config.json \
--eval-task-config ~/benchmark_config.json \
--visualize
```
@ -309,15 +309,15 @@ llama-stack-client eval run_scoring <scoring_fn_id_1> <scoring_fn_id_2> ... <sco
--output-dir ./
```
#### Defining EvalTaskConfig
The `EvalTaskConfig` are user specified config to define:
#### Defining BenchmarkConfig
The `BenchmarkConfig` are user specified config to define:
1. `EvalCandidate` to run generation on:
- `ModelCandidate`: The model will be used for generation through LlamaStack /inference API.
- `AgentCandidate`: The agentic system specified by AgentConfig will be used for generation through LlamaStack /agents API.
2. Optionally scoring function params to allow customization of scoring function behaviour. This is useful to parameterize generic scoring functions such as LLMAsJudge with custom `judge_model` / `judge_prompt`.
**Example Benchmark EvalTaskConfig**
**Example Benchmark BenchmarkConfig**
```json
{
"type": "benchmark",
@ -335,7 +335,7 @@ The `EvalTaskConfig` are user specified config to define:
}
```
**Example Application EvalTaskConfig**
**Example Application BenchmarkConfig**
```json
{
"type": "app",


@ -161,14 +161,14 @@ Options:
## Eval Task Management
### `llama-stack-client eval_tasks list`
### `llama-stack-client benchmarks list`
```bash
$ llama-stack-client eval_tasks list
$ llama-stack-client benchmarks list
```
### `llama-stack-client eval_tasks register`
### `llama-stack-client benchmarks register`
```bash
$ llama-stack-client eval_tasks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
$ llama-stack-client benchmarks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
```
Options:
@ -191,7 +191,7 @@ Options:
- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
- `--visualize`: Optional flag. If set, visualizes evaluation results after completion
Example eval_task_config.json:
Example benchmark_config.json:
```json
{
"type": "benchmark",


@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job
Methods:
- <code title="post /v1/eval/tasks/{task_id}/evaluations">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">evaluate_rows</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_evaluate_rows_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
- <code title="post /v1/eval/tasks/{task_id}/jobs">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">run_eval</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_run_eval_params.py">params</a>) -> <a href="./src/llama_stack_client/types/job.py">Job</a></code>
- <code title="post /v1/eval/tasks/{benchmark_id}/evaluations">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">evaluate_rows</a>(benchmark_id, \*\*<a href="src/llama_stack_client/types/eval_evaluate_rows_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
- <code title="post /v1/eval/tasks/{benchmark_id}/jobs">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">run_eval</a>(benchmark_id, \*\*<a href="src/llama_stack_client/types/eval_run_eval_params.py">params</a>) -> <a href="./src/llama_stack_client/types/job.py">Job</a></code>
### Jobs
@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse
Methods:
- <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}/result">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">retrieve</a>(job_id, \*, task_id) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
- <code title="delete /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">cancel</a>(job_id, \*, task_id) -> None</code>
- <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">status</a>(job_id, \*, task_id) -> Optional[JobStatusResponse]</code>
- <code title="get /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">retrieve</a>(job_id, \*, benchmark_id) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
- <code title="delete /v1/eval/tasks/{benchmark_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">cancel</a>(job_id, \*, benchmark_id) -> None</code>
- <code title="get /v1/eval/tasks/{benchmark_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">status</a>(job_id, \*, benchmark_id) -> Optional[JobStatusResponse]</code>
## Inspect
@ -443,20 +443,20 @@ Methods:
- <code title="get /v1/scoring-functions">client.scoring_functions.<a href="./src/llama_stack_client/resources/scoring_functions.py">list</a>() -> <a href="./src/llama_stack_client/types/scoring_function_list_response.py">ScoringFunctionListResponse</a></code>
- <code title="post /v1/scoring-functions">client.scoring_functions.<a href="./src/llama_stack_client/resources/scoring_functions.py">register</a>(\*\*<a href="src/llama_stack_client/types/scoring_function_register_params.py">params</a>) -> None</code>
## EvalTasks
## Benchmarks
Types:
```python
from llama_stack_client.types import (
EvalTask,
ListEvalTasksResponse,
EvalTaskListResponse,
Benchmark,
ListBenchmarksResponse,
BenchmarkListResponse,
)
```
Methods:
- <code title="get /v1/eval-tasks/{eval_task_id}">client.eval_tasks.<a href="./src/llama_stack_client/resources/eval_tasks.py">retrieve</a>(eval_task_id) -> <a href="./src/llama_stack_client/types/eval_task.py">Optional[EvalTask]</a></code>
- <code title="get /v1/eval-tasks">client.eval_tasks.<a href="./src/llama_stack_client/resources/eval_tasks.py">list</a>() -> <a href="./src/llama_stack_client/types/eval_task_list_response.py">EvalTaskListResponse</a></code>
- <code title="post /v1/eval-tasks">client.eval_tasks.<a href="./src/llama_stack_client/resources/eval_tasks.py">register</a>(\*\*<a href="src/llama_stack_client/types/eval_task_register_params.py">params</a>) -> None</code>
- <code title="get /v1/eval-tasks/{benchmark_id}">client.benchmarks.<a href="./src/llama_stack_client/resources/benchmarks.py">retrieve</a>(benchmark_id) -> <a href="./src/llama_stack_client/types/benchmark.py">Optional[Benchmark]</a></code>
- <code title="get /v1/eval-tasks">client.benchmarks.<a href="./src/llama_stack_client/resources/benchmarks.py">list</a>() -> <a href="./src/llama_stack_client/types/benchmark_list_response.py">BenchmarkListResponse</a></code>
- <code title="post /v1/eval-tasks">client.benchmarks.<a href="./src/llama_stack_client/resources/benchmarks.py">register</a>(\*\*<a href="src/llama_stack_client/types/benchmark_register_params.py">params</a>) -> None</code>


@ -4,4 +4,4 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .eval_tasks import * # noqa: F401 F403
from .benchmarks import * # noqa: F401 F403


@ -0,0 +1,86 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field
from llama_stack.apis.resource import Resource, ResourceType
class CommonBenchmarkFields(BaseModel):
dataset_id: str
scoring_functions: List[str]
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="Metadata for this evaluation task",
)
@json_schema_type
class Benchmark(CommonBenchmarkFields, Resource):
type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value
@property
def benchmark_id(self) -> str:
return self.identifier
@property
def provider_benchmark_id(self) -> str:
return self.provider_resource_id
class BenchmarkInput(CommonBenchmarkFields, BaseModel):
benchmark_id: str
provider_id: Optional[str] = None
provider_benchmark_id: Optional[str] = None
class ListBenchmarksResponse(BaseModel):
data: List[Benchmark]
@runtime_checkable
class Benchmarks(Protocol):
@webmethod(route="/eval/benchmarks", method="GET")
async def list_benchmarks(self) -> ListBenchmarksResponse: ...
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET")
async def get_benchmark(
self,
benchmark_id: str,
) -> Optional[Benchmark]: ...
@webmethod(route="/eval/benchmarks", method="POST")
async def register_benchmark(
self,
benchmark_id: str,
dataset_id: str,
scoring_functions: List[str],
provider_benchmark_id: Optional[str] = None,
provider_id: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
) -> None: ...
@webmethod(route="/eval-tasks", method="GET")
async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ...
@webmethod(route="/eval-tasks/{task_id}", method="GET")
async def DEPRECATED_get_eval_task(
self,
eval_task_id: str,
) -> Optional[Benchmark]: ...
@webmethod(route="/eval-tasks", method="POST")
async def DEPRECATED_register_eval_task(
self,
eval_task_id: str,
dataset_id: str,
scoring_functions: List[str],
provider_benchmark_id: Optional[str] = None,
provider_id: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
) -> None: ...


@ -28,7 +28,7 @@ class Api(Enum):
vector_dbs = "vector_dbs"
datasets = "datasets"
scoring_functions = "scoring_functions"
eval_tasks = "eval_tasks"
benchmarks = "benchmarks"
tool_groups = "tool_groups"
# built-in API


@ -38,19 +38,9 @@ EvalCandidate = register_schema(
@json_schema_type
class BenchmarkEvalTaskConfig(BaseModel):
class BenchmarkConfig(BaseModel):
type: Literal["benchmark"] = "benchmark"
eval_candidate: EvalCandidate
num_examples: Optional[int] = Field(
description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
default=None,
)
@json_schema_type
class AppEvalTaskConfig(BaseModel):
type: Literal["app"] = "app"
eval_candidate: EvalCandidate
scoring_params: Dict[str, ScoringFnParams] = Field(
description="Map between scoring function id and parameters for each scoring function you want to run",
default_factory=dict,
@ -62,12 +52,6 @@ class AppEvalTaskConfig(BaseModel):
# we could optinally add any specific dataset config here
EvalTaskConfig = register_schema(
Annotated[Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")],
name="EvalTaskConfig",
)
@json_schema_type
class EvaluateResponse(BaseModel):
generations: List[Dict[str, Any]]
@ -76,27 +60,52 @@ class EvaluateResponse(BaseModel):
class Eval(Protocol):
@webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
async def run_eval(
self,
benchmark_id: str,
task_config: BenchmarkConfig,
) -> Job: ...
@webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
async def evaluate_rows(
self,
benchmark_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
task_config: BenchmarkConfig,
) -> EvaluateResponse: ...
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ...
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ...
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ...
@webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
async def DEPRECATED_run_eval(
self,
task_id: str,
task_config: EvalTaskConfig,
task_config: BenchmarkConfig,
) -> Job: ...
@webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
async def evaluate_rows(
async def DEPRECATED_evaluate_rows(
self,
task_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
task_config: EvalTaskConfig,
task_config: BenchmarkConfig,
) -> EvaluateResponse: ...
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
async def DEPRECATED_job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
async def job_cancel(self, task_id: str, job_id: str) -> None: ...
async def DEPRECATED_job_cancel(self, task_id: str, job_id: str) -> None: ...
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ...
async def DEPRECATED_job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ...


@ -1,66 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field
from llama_stack.apis.resource import Resource, ResourceType
class CommonEvalTaskFields(BaseModel):
dataset_id: str
scoring_functions: List[str]
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="Metadata for this evaluation task",
)
@json_schema_type
class EvalTask(CommonEvalTaskFields, Resource):
type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value
@property
def eval_task_id(self) -> str:
return self.identifier
@property
def provider_eval_task_id(self) -> str:
return self.provider_resource_id
class EvalTaskInput(CommonEvalTaskFields, BaseModel):
eval_task_id: str
provider_id: Optional[str] = None
provider_eval_task_id: Optional[str] = None
class ListEvalTasksResponse(BaseModel):
data: List[EvalTask]
@runtime_checkable
class EvalTasks(Protocol):
@webmethod(route="/eval-tasks", method="GET")
async def list_eval_tasks(self) -> ListEvalTasksResponse: ...
@webmethod(route="/eval-tasks/{eval_task_id}", method="GET")
async def get_eval_task(
self,
eval_task_id: str,
) -> Optional[EvalTask]: ...
@webmethod(route="/eval-tasks", method="POST")
async def register_eval_task(
self,
eval_task_id: str,
dataset_id: str,
scoring_functions: List[str],
provider_eval_task_id: Optional[str] = None,
provider_id: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
) -> None: ...


@ -15,7 +15,7 @@ class ResourceType(Enum):
vector_db = "vector_db"
dataset = "dataset"
scoring_function = "scoring_function"
eval_task = "eval_task"
benchmark = "benchmark"
tool = "tool"
tool_group = "tool_group"


@ -8,10 +8,10 @@ from typing import Annotated, Any, Dict, List, Optional, Union
from pydantic import BaseModel, Field
from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Dataset, DatasetInput
from llama_stack.apis.eval import Eval
from llama_stack.apis.eval_tasks import EvalTask, EvalTaskInput
from llama_stack.apis.inference import Inference
from llama_stack.apis.models import Model, ModelInput
from llama_stack.apis.safety import Safety
@ -37,7 +37,7 @@ RoutableObject = Union[
VectorDB,
Dataset,
ScoringFn,
EvalTask,
Benchmark,
Tool,
ToolGroup,
]
@ -50,7 +50,7 @@ RoutableObjectWithProvider = Annotated[
VectorDB,
Dataset,
ScoringFn,
EvalTask,
Benchmark,
Tool,
ToolGroup,
],
@ -173,7 +173,7 @@ a default SQLite store will be used.""",
vector_dbs: List[VectorDBInput] = Field(default_factory=list)
datasets: List[DatasetInput] = Field(default_factory=list)
scoring_fns: List[ScoringFnInput] = Field(default_factory=list)
eval_tasks: List[EvalTaskInput] = Field(default_factory=list)
benchmarks: List[BenchmarkInput] = Field(default_factory=list)
tool_groups: List[ToolGroupInput] = Field(default_factory=list)
server: ServerConfig = Field(


@ -44,7 +44,7 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]:
router_api=Api.scoring,
),
AutoRoutedApiInfo(
routing_table_api=Api.eval_tasks,
routing_table_api=Api.benchmarks,
router_api=Api.eval,
),
AutoRoutedApiInfo(


@ -9,10 +9,10 @@ import logging
from typing import Any, Dict, List, Set
from llama_stack.apis.agents import Agents
from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.eval import Eval
from llama_stack.apis.eval_tasks import EvalTasks
from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models
@ -37,8 +37,8 @@ from llama_stack.distribution.store import DistributionRegistry
from llama_stack.distribution.utils.dynamic import instantiate_class_type
from llama_stack.providers.datatypes import (
Api,
BenchmarksProtocolPrivate,
DatasetsProtocolPrivate,
EvalTasksProtocolPrivate,
InlineProviderSpec,
ModelsProtocolPrivate,
ProviderSpec,
@ -73,7 +73,7 @@ def api_protocol_map() -> Dict[Api, Any]:
Api.scoring: Scoring,
Api.scoring_functions: ScoringFunctions,
Api.eval: Eval,
Api.eval_tasks: EvalTasks,
Api.benchmarks: Benchmarks,
Api.post_training: PostTraining,
Api.tool_groups: ToolGroups,
Api.tool_runtime: ToolRuntime,
@ -92,7 +92,7 @@ def additional_protocols_map() -> Dict[Api, Any]:
ScoringFunctions,
Api.scoring_functions,
),
Api.eval: (EvalTasksProtocolPrivate, EvalTasks, Api.eval_tasks),
Api.eval: (BenchmarksProtocolPrivate, Benchmarks, Api.benchmarks),
}


@ -11,8 +11,8 @@ from llama_stack.distribution.store import DistributionRegistry
from llama_stack.providers.datatypes import Api, RoutingTable
from .routing_tables import (
BenchmarksRoutingTable,
DatasetsRoutingTable,
EvalTasksRoutingTable,
ModelsRoutingTable,
ScoringFunctionsRoutingTable,
ShieldsRoutingTable,
@ -33,7 +33,7 @@ async def get_routing_table_impl(
"shields": ShieldsRoutingTable,
"datasets": DatasetsRoutingTable,
"scoring_functions": ScoringFunctionsRoutingTable,
"eval_tasks": EvalTasksRoutingTable,
"benchmarks": BenchmarksRoutingTable,
"tool_groups": ToolGroupsRoutingTable,
}


@ -9,9 +9,8 @@ from typing import Any, AsyncGenerator, Dict, List, Optional
from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult
from llama_stack.apis.eval import (
AppEvalTaskConfig,
BenchmarkConfig,
Eval,
EvalTaskConfig,
EvaluateResponse,
Job,
JobStatus,
@ -347,23 +346,23 @@ class EvalRouter(Eval):
async def run_eval(
self,
task_id: str,
task_config: AppEvalTaskConfig,
benchmark_id: str,
task_config: BenchmarkConfig,
) -> Job:
return await self.routing_table.get_provider_impl(task_id).run_eval(
task_id=task_id,
return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
benchmark_id=benchmark_id,
task_config=task_config,
)
async def evaluate_rows(
self,
task_id: str,
benchmark_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
task_config: EvalTaskConfig,
task_config: BenchmarkConfig,
) -> EvaluateResponse:
return await self.routing_table.get_provider_impl(task_id).evaluate_rows(
task_id=task_id,
return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows(
benchmark_id=benchmark_id,
input_rows=input_rows,
scoring_functions=scoring_functions,
task_config=task_config,
@ -371,30 +370,72 @@ class EvalRouter(Eval):
async def job_status(
self,
task_id: str,
benchmark_id: str,
job_id: str,
) -> Optional[JobStatus]:
return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id)
return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id)
async def job_cancel(
self,
task_id: str,
benchmark_id: str,
job_id: str,
) -> None:
await self.routing_table.get_provider_impl(task_id).job_cancel(
task_id,
await self.routing_table.get_provider_impl(benchmark_id).job_cancel(
benchmark_id,
job_id,
)
async def job_result(
self,
benchmark_id: str,
job_id: str,
) -> EvaluateResponse:
return await self.routing_table.get_provider_impl(benchmark_id).job_result(
benchmark_id,
job_id,
)
async def DEPRECATED_run_eval(
self,
task_id: str,
task_config: BenchmarkConfig,
) -> Job:
return await self.run_eval(benchmark_id=task_id, task_config=task_config)
async def DEPRECATED_evaluate_rows(
self,
task_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
task_config: BenchmarkConfig,
) -> EvaluateResponse:
return await self.evaluate_rows(
benchmark_id=task_id,
input_rows=input_rows,
scoring_functions=scoring_functions,
task_config=task_config,
)
async def DEPRECATED_job_status(
self,
task_id: str,
job_id: str,
) -> Optional[JobStatus]:
return await self.job_status(benchmark_id=task_id, job_id=job_id)
async def DEPRECATED_job_cancel(
self,
task_id: str,
job_id: str,
) -> None:
return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
async def DEPRECATED_job_result(
self,
task_id: str,
job_id: str,
) -> EvaluateResponse:
return await self.routing_table.get_provider_impl(task_id).job_result(
task_id,
job_id,
)
return await self.job_result(benchmark_id=task_id, job_id=job_id)
class ToolRuntimeRouter(ToolRuntime):


@ -4,14 +4,15 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import logging
from typing import Any, Dict, List, Optional
from pydantic import TypeAdapter
from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.type_system import ParamType
from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse
from llama_stack.apis.eval_tasks import EvalTask, EvalTasks, ListEvalTasksResponse
from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType
from llama_stack.apis.resource import ResourceType
from llama_stack.apis.scoring_functions import (
@ -38,6 +39,8 @@ from llama_stack.distribution.datatypes import (
from llama_stack.distribution.store import DistributionRegistry
from llama_stack.providers.datatypes import Api, RoutingTable
logger = logging.getLogger(__name__)
def get_impl_api(p: Any) -> Api:
return p.__provider_spec__.api
@ -60,7 +63,7 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable
elif api == Api.scoring:
return await p.register_scoring_function(obj)
elif api == Api.eval:
return await p.register_eval_task(obj)
return await p.register_benchmark(obj)
elif api == Api.tool_runtime:
return await p.register_tool(obj)
else:
@ -121,7 +124,7 @@ class CommonRoutingTableImpl(RoutingTable):
scoring_functions = await p.list_scoring_functions()
await add_objects(scoring_functions, pid, ScoringFn)
elif api == Api.eval:
p.eval_task_store = self
p.benchmark_store = self
elif api == Api.tool_runtime:
p.tool_store = self
@ -141,8 +144,8 @@ class CommonRoutingTableImpl(RoutingTable):
return ("DatasetIO", "dataset")
elif isinstance(self, ScoringFunctionsRoutingTable):
return ("Scoring", "scoring_function")
elif isinstance(self, EvalTasksRoutingTable):
return ("Eval", "eval_task")
elif isinstance(self, BenchmarksRoutingTable):
return ("Eval", "benchmark")
elif isinstance(self, ToolGroupsRoutingTable):
return ("Tools", "tool")
else:
@ -428,20 +431,20 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
await self.register_object(scoring_fn)
class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks):
async def list_eval_tasks(self) -> ListEvalTasksResponse:
return ListEvalTasksResponse(data=await self.get_all_with_type("eval_task"))
class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
async def list_benchmarks(self) -> ListBenchmarksResponse:
return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark"))
async def get_eval_task(self, eval_task_id: str) -> Optional[EvalTask]:
return await self.get_object_by_identifier("eval_task", eval_task_id)
async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]:
return await self.get_object_by_identifier("benchmark", benchmark_id)
async def register_eval_task(
async def register_benchmark(
self,
eval_task_id: str,
benchmark_id: str,
dataset_id: str,
scoring_functions: List[str],
metadata: Optional[Dict[str, Any]] = None,
provider_eval_task_id: Optional[str] = None,
provider_benchmark_id: Optional[str] = None,
provider_id: Optional[str] = None,
) -> None:
if metadata is None:
@ -453,17 +456,46 @@ class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks):
raise ValueError(
"No provider specified and multiple providers available. Please specify a provider_id."
)
if provider_eval_task_id is None:
provider_eval_task_id = eval_task_id
eval_task = EvalTask(
identifier=eval_task_id,
if provider_benchmark_id is None:
provider_benchmark_id = benchmark_id
benchmark = Benchmark(
identifier=benchmark_id,
dataset_id=dataset_id,
scoring_functions=scoring_functions,
metadata=metadata,
provider_id=provider_id,
provider_resource_id=provider_eval_task_id,
provider_resource_id=provider_benchmark_id,
)
await self.register_object(benchmark)
async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse:
logger.warning("DEPRECATED: Use /eval/benchmarks instead")
return await self.list_benchmarks()
async def DEPRECATED_get_eval_task(
self,
eval_task_id: str,
) -> Optional[Benchmark]:
logger.warning("DEPRECATED: Use /eval/benchmarks instead")
return await self.get_benchmark(eval_task_id)
async def DEPRECATED_register_eval_task(
self,
eval_task_id: str,
dataset_id: str,
scoring_functions: List[str],
provider_benchmark_id: Optional[str] = None,
provider_id: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
) -> None:
logger.warning("DEPRECATED: Use /eval/benchmarks instead")
return await self.register_benchmark(
benchmark_id=eval_task_id,
dataset_id=dataset_id,
scoring_functions=scoring_functions,
metadata=metadata,
provider_benchmark_id=provider_benchmark_id,
)
await self.register_object(eval_task)
class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):


@ -15,10 +15,10 @@ from termcolor import colored
from llama_stack.apis.agents import Agents
from llama_stack.apis.batch_inference import BatchInference
from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.eval import Eval
from llama_stack.apis.eval_tasks import EvalTasks
from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models
@ -53,7 +53,7 @@ class LlamaStack(
PostTraining,
VectorIO,
Eval,
EvalTasks,
Benchmarks,
Scoring,
ScoringFunctions,
DatasetIO,
@ -78,7 +78,7 @@ RESOURCES = [
"register_scoring_function",
"list_scoring_functions",
),
("eval_tasks", Api.eval_tasks, "register_eval_task", "list_eval_tasks"),
("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks"),
("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups"),
]


@ -26,7 +26,7 @@ $ llama-stack-client datasets register \
```
```bash
$ llama-stack-client eval_tasks register \
$ llama-stack-client benchmarks register \
--eval-task-id meta-reference-mmlu \
--provider-id meta-reference \
--dataset-id mmlu \


@ -8,12 +8,12 @@ import streamlit as st
from modules.api import llama_stack_api
def eval_tasks():
# Eval Tasks Section
st.header("Eval Tasks")
def benchmarks():
# Benchmarks Section
st.header("Benchmarks")
eval_tasks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.eval_tasks.list()}
benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()}
if len(eval_tasks_info) > 0:
selected_eval_task = st.selectbox("Select an eval task", list(eval_tasks_info.keys()), key="eval_task_inspect")
st.json(eval_tasks_info[selected_eval_task], expanded=True)
if len(benchmarks_info) > 0:
selected_benchmark = st.selectbox("Select an eval task", list(benchmarks_info.keys()), key="benchmark_inspect")
st.json(benchmarks_info[selected_benchmark], expanded=True)


@ -4,8 +4,8 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from page.distribution.benchmarks import benchmarks
from page.distribution.datasets import datasets
from page.distribution.eval_tasks import eval_tasks
from page.distribution.models import models
from page.distribution.scoring_functions import scoring_functions
from page.distribution.shields import shields
@ -20,7 +20,7 @@ def resources_page():
"Shields",
"Scoring Functions",
"Datasets",
"Eval Tasks",
"Benchmarks",
]
icons = ["magic", "memory", "shield", "file-bar-graph", "database", "list-task"]
selected_resource = option_menu(
@ -34,8 +34,8 @@ def resources_page():
},
},
)
if selected_resource == "Eval Tasks":
eval_tasks()
if selected_resource == "Benchmarks":
benchmarks()
elif selected_resource == "Vector Databases":
vector_dbs()
elif selected_resource == "Datasets":


@ -11,28 +11,28 @@ import streamlit as st
from modules.api import llama_stack_api
def select_eval_task_1():
# Select Eval Tasks
def select_benchmark_1():
# Select Benchmarks
st.subheader("1. Choose An Eval Task")
eval_tasks = llama_stack_api.client.eval_tasks.list()
eval_tasks = {et.identifier: et for et in eval_tasks}
eval_tasks_names = list(eval_tasks.keys())
selected_eval_task = st.selectbox(
benchmarks = llama_stack_api.client.benchmarks.list()
benchmarks = {et.identifier: et for et in benchmarks}
benchmarks_names = list(benchmarks.keys())
selected_benchmark = st.selectbox(
"Choose an eval task.",
options=eval_tasks_names,
options=benchmarks_names,
help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
)
with st.expander("View Eval Task"):
st.json(eval_tasks[selected_eval_task], expanded=True)
st.json(benchmarks[selected_benchmark], expanded=True)
st.session_state["selected_eval_task"] = selected_eval_task
st.session_state["eval_tasks"] = eval_tasks
st.session_state["selected_benchmark"] = selected_benchmark
st.session_state["benchmarks"] = benchmarks
if st.button("Confirm", key="confirm_1"):
st.session_state["selected_eval_task_1_next"] = True
st.session_state["selected_benchmark_1_next"] = True
def define_eval_candidate_2():
if not st.session_state.get("selected_eval_task_1_next", None):
if not st.session_state.get("selected_benchmark_1_next", None):
return
st.subheader("2. Define Eval Candidate")
@ -161,11 +161,11 @@ def run_evaluation_3():
Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
"""
)
selected_eval_task = st.session_state["selected_eval_task"]
eval_tasks = st.session_state["eval_tasks"]
selected_benchmark = st.session_state["selected_benchmark"]
benchmarks = st.session_state["benchmarks"]
eval_candidate = st.session_state["eval_candidate"]
dataset_id = eval_tasks[selected_eval_task].dataset_id
dataset_id = benchmarks[selected_benchmark].dataset_id
rows = llama_stack_api.client.datasetio.get_rows_paginated(
dataset_id=dataset_id,
rows_in_page=-1,
@ -180,16 +180,16 @@ def run_evaluation_3():
help="Number of examples from the dataset to evaluate. ",
)
eval_task_config = {
benchmark_config = {
"type": "benchmark",
"eval_candidate": eval_candidate,
"scoring_params": {},
}
with st.expander("View Evaluation Task", expanded=True):
st.json(eval_tasks[selected_eval_task], expanded=True)
st.json(benchmarks[selected_benchmark], expanded=True)
with st.expander("View Evaluation Task Configuration", expanded=True):
st.json(eval_task_config, expanded=True)
st.json(benchmark_config, expanded=True)
# Add run button and handle evaluation
if st.button("Run Evaluation"):
@ -209,10 +209,10 @@ def run_evaluation_3():
progress_bar.progress(progress, text=progress_text)
# Run evaluation for current row
eval_res = llama_stack_api.client.eval.evaluate_rows(
task_id=selected_eval_task,
benchmark_id=selected_benchmark,
input_rows=[r],
scoring_functions=eval_tasks[selected_eval_task].scoring_functions,
task_config=eval_task_config,
scoring_functions=benchmarks[selected_benchmark].scoring_functions,
task_config=benchmark_config,
)
for k in r.keys():
@ -225,7 +225,7 @@ def run_evaluation_3():
output_res[k] = []
output_res[k].append(eval_res.generations[0][k])
for scoring_fn in eval_tasks[selected_eval_task].scoring_functions:
for scoring_fn in benchmarks[selected_benchmark].scoring_functions:
if scoring_fn not in output_res:
output_res[scoring_fn] = []
output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])
@ -245,7 +245,7 @@ def native_evaluation_page():
st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
st.title("📊 Evaluations (Generation + Scoring)")
select_eval_task_1()
select_benchmark_1()
define_eval_candidate_2()
run_evaluation_3()


@ -10,9 +10,9 @@ from urllib.parse import urlparse
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field
from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.datasets import Dataset
from llama_stack.apis.datatypes import Api
from llama_stack.apis.eval_tasks import EvalTask
from llama_stack.apis.models import Model
from llama_stack.apis.scoring_functions import ScoringFn
from llama_stack.apis.shields import Shield
@ -48,8 +48,8 @@ class ScoringFunctionsProtocolPrivate(Protocol):
async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ...
class EvalTasksProtocolPrivate(Protocol):
async def register_eval_task(self, eval_task: EvalTask) -> None: ...
class BenchmarksProtocolPrivate(Protocol):
async def register_benchmark(self, benchmark: Benchmark) -> None: ...
class ToolsProtocolPrivate(Protocol):


@ -8,13 +8,13 @@ from typing import Any, Dict, List, Optional
from tqdm import tqdm
from llama_stack.apis.agents import Agents, StepType
from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.eval_tasks import EvalTask
from llama_stack.apis.inference import Inference, UserMessage
from llama_stack.apis.scoring import Scoring
from llama_stack.distribution.datatypes import Api
from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
from llama_stack.providers.datatypes import BenchmarksProtocolPrivate
from llama_stack.providers.inline.agents.meta_reference.agent_instance import (
MEMORY_QUERY_TOOL,
)
@ -26,15 +26,15 @@ from llama_stack.providers.utils.common.data_schema_validator import (
from llama_stack.providers.utils.kvstore import kvstore_impl
from .....apis.common.job_types import Job
from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse, JobStatus
from .config import MetaReferenceEvalConfig
EVAL_TASKS_PREFIX = "eval_tasks:"
EVAL_TASKS_PREFIX = "benchmarks:"
class MetaReferenceEvalImpl(
Eval,
EvalTasksProtocolPrivate,
BenchmarksProtocolPrivate,
):
def __init__(
self,
@ -55,36 +55,36 @@ class MetaReferenceEvalImpl(
# TODO: assume sync job, will need jobs API for async scheduling
self.jobs = {}
self.eval_tasks = {}
self.benchmarks = {}
async def initialize(self) -> None:
self.kvstore = await kvstore_impl(self.config.kvstore)
# Load existing eval_tasks from kvstore
# Load existing benchmarks from kvstore
start_key = EVAL_TASKS_PREFIX
end_key = f"{EVAL_TASKS_PREFIX}\xff"
stored_eval_tasks = await self.kvstore.range(start_key, end_key)
stored_benchmarks = await self.kvstore.range(start_key, end_key)
for eval_task in stored_eval_tasks:
eval_task = EvalTask.model_validate_json(eval_task)
self.eval_tasks[eval_task.identifier] = eval_task
for benchmark in stored_benchmarks:
benchmark = Benchmark.model_validate_json(benchmark)
self.benchmarks[benchmark.identifier] = benchmark
async def shutdown(self) -> None: ...
async def register_eval_task(self, task_def: EvalTask) -> None:
async def register_benchmark(self, task_def: Benchmark) -> None:
# Store in kvstore
key = f"{EVAL_TASKS_PREFIX}{task_def.identifier}"
await self.kvstore.set(
key=key,
value=task_def.model_dump_json(),
)
self.eval_tasks[task_def.identifier] = task_def
self.benchmarks[task_def.identifier] = task_def
async def run_eval(
self,
task_id: str,
task_config: EvalTaskConfig,
benchmark_id: str,
task_config: BenchmarkConfig,
) -> Job:
task_def = self.eval_tasks[task_id]
task_def = self.benchmarks[benchmark_id]
dataset_id = task_def.dataset_id
candidate = task_config.eval_candidate
scoring_functions = task_def.scoring_functions
@ -95,7 +95,7 @@ class MetaReferenceEvalImpl(
rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples),
)
res = await self.evaluate_rows(
task_id=task_id,
benchmark_id=benchmark_id,
input_rows=all_rows.rows,
scoring_functions=scoring_functions,
task_config=task_config,
@ -108,7 +108,7 @@ class MetaReferenceEvalImpl(
return Job(job_id=job_id)
async def _run_agent_generation(
self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
) -> List[Dict[str, Any]]:
candidate = task_config.eval_candidate
create_response = await self.agents_api.create_agent(candidate.config)
@ -151,7 +151,7 @@ class MetaReferenceEvalImpl(
return generations
async def _run_model_generation(
self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
) -> List[Dict[str, Any]]:
candidate = task_config.eval_candidate
assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"
@ -187,10 +187,10 @@ class MetaReferenceEvalImpl(
async def evaluate_rows(
self,
task_id: str,
benchmark_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
task_config: EvalTaskConfig,
task_config: BenchmarkConfig,
) -> EvaluateResponse:
candidate = task_config.eval_candidate
if candidate.type == "agent":
@ -203,7 +203,7 @@ class MetaReferenceEvalImpl(
# scoring with generated_answer
score_input_rows = [input_r | generated_r for input_r, generated_r in zip(input_rows, generations)]
if task_config.type == "app" and task_config.scoring_params is not None:
if task_config.scoring_params is not None:
scoring_functions_dict = {
scoring_fn_id: task_config.scoring_params.get(scoring_fn_id, None)
for scoring_fn_id in scoring_functions
@ -217,18 +217,60 @@ class MetaReferenceEvalImpl(
return EvaluateResponse(generations=generations, scores=score_response.results)
async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]:
async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
if job_id in self.jobs:
return JobStatus.completed
return None
async def job_cancel(self, task_id: str, job_id: str) -> None:
async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
raise NotImplementedError("Job cancel is not implemented yet")
async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse:
status = await self.job_status(task_id, job_id)
async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
status = await self.job_status(benchmark_id, job_id)
if not status or status != JobStatus.completed:
raise ValueError(f"Job is not completed, Status: {status.value}")
return self.jobs[job_id]
async def DEPRECATED_run_eval(
self,
task_id: str,
task_config: BenchmarkConfig,
) -> Job:
return await self.run_eval(benchmark_id=task_id, task_config=task_config)
async def DEPRECATED_evaluate_rows(
self,
task_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
task_config: BenchmarkConfig,
) -> EvaluateResponse:
return await self.evaluate_rows(
benchmark_id=task_id,
input_rows=input_rows,
scoring_functions=scoring_functions,
task_config=task_config,
)
async def DEPRECATED_job_status(
self,
task_id: str,
job_id: str,
) -> Optional[JobStatus]:
return await self.job_status(benchmark_id=task_id, job_id=job_id)
async def DEPRECATED_job_cancel(
self,
task_id: str,
job_id: str,
) -> None:
return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
async def DEPRECATED_job_result(
self,
task_id: str,
job_id: str,
) -> EvaluateResponse:
return await self.job_result(benchmark_id=task_id, job_id=job_id)


@ -10,8 +10,8 @@ import pytest
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType
from llama_stack.apis.eval.eval import (
AppEvalTaskConfig,
BenchmarkEvalTaskConfig,
AppBenchmarkConfig,
BenchmarkBenchmarkConfig,
ModelCandidate,
)
from llama_stack.apis.inference import SamplingParams
@ -30,18 +30,18 @@ from .constants import JUDGE_PROMPT
class Testeval:
@pytest.mark.asyncio
async def test_eval_tasks_list(self, eval_stack):
async def test_benchmarks_list(self, eval_stack):
# NOTE: this needs you to ensure that you are starting from a clean state
# but so far we don't have an unregister API unfortunately, so be careful
eval_tasks_impl = eval_stack[Api.eval_tasks]
response = await eval_tasks_impl.list_eval_tasks()
benchmarks_impl = eval_stack[Api.benchmarks]
response = await benchmarks_impl.list_benchmarks()
assert isinstance(response, list)
@pytest.mark.asyncio
async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model):
eval_impl, eval_tasks_impl, datasetio_impl, datasets_impl, models_impl = (
eval_impl, benchmarks_impl, datasetio_impl, datasets_impl, models_impl = (
eval_stack[Api.eval],
eval_stack[Api.eval_tasks],
eval_stack[Api.benchmarks],
eval_stack[Api.datasetio],
eval_stack[Api.datasets],
eval_stack[Api.models],
@ -59,17 +59,17 @@ class Testeval:
scoring_functions = [
"basic::equality",
]
task_id = "meta-reference::app_eval"
await eval_tasks_impl.register_eval_task(
eval_task_id=task_id,
benchmark_id = "meta-reference::app_eval"
await benchmarks_impl.register_benchmark(
benchmark_id=benchmark_id,
dataset_id="test_dataset_for_eval",
scoring_functions=scoring_functions,
)
response = await eval_impl.evaluate_rows(
task_id=task_id,
benchmark_id=benchmark_id,
input_rows=rows.rows,
scoring_functions=scoring_functions,
task_config=AppEvalTaskConfig(
task_config=AppBenchmarkConfig(
eval_candidate=ModelCandidate(
model=inference_model,
sampling_params=SamplingParams(),
@ -92,9 +92,9 @@ class Testeval:
@pytest.mark.asyncio
async def test_eval_run_eval(self, eval_stack, inference_model, judge_model):
eval_impl, eval_tasks_impl, datasets_impl, models_impl = (
eval_impl, benchmarks_impl, datasets_impl, models_impl = (
eval_stack[Api.eval],
eval_stack[Api.eval_tasks],
eval_stack[Api.benchmarks],
eval_stack[Api.datasets],
eval_stack[Api.models],
)
@ -105,15 +105,15 @@ class Testeval:
"basic::subset_of",
]
task_id = "meta-reference::app_eval-2"
await eval_tasks_impl.register_eval_task(
eval_task_id=task_id,
benchmark_id = "meta-reference::app_eval-2"
await benchmarks_impl.register_benchmark(
benchmark_id=benchmark_id,
dataset_id="test_dataset_for_eval",
scoring_functions=scoring_functions,
)
response = await eval_impl.run_eval(
task_id=task_id,
task_config=AppEvalTaskConfig(
benchmark_id=benchmark_id,
task_config=AppBenchmarkConfig(
eval_candidate=ModelCandidate(
model=inference_model,
sampling_params=SamplingParams(),
@ -121,9 +121,9 @@ class Testeval:
),
)
assert response.job_id == "0"
job_status = await eval_impl.job_status(task_id, response.job_id)
job_status = await eval_impl.job_status(benchmark_id, response.job_id)
assert job_status and job_status.value == "completed"
eval_response = await eval_impl.job_result(task_id, response.job_id)
eval_response = await eval_impl.job_result(benchmark_id, response.job_id)
assert eval_response is not None
assert len(eval_response.generations) == 5
@ -131,9 +131,9 @@ class Testeval:
@pytest.mark.asyncio
async def test_eval_run_benchmark_eval(self, eval_stack, inference_model):
eval_impl, eval_tasks_impl, datasets_impl, models_impl = (
eval_impl, benchmarks_impl, datasets_impl, models_impl = (
eval_stack[Api.eval],
eval_stack[Api.eval_tasks],
eval_stack[Api.benchmarks],
eval_stack[Api.datasets],
eval_stack[Api.models],
)
@ -159,20 +159,20 @@ class Testeval:
)
# register eval task
await eval_tasks_impl.register_eval_task(
eval_task_id="meta-reference-mmlu",
await benchmarks_impl.register_benchmark(
benchmark_id="meta-reference-mmlu",
dataset_id="mmlu",
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
# list benchmarks
response = await eval_tasks_impl.list_eval_tasks()
response = await benchmarks_impl.list_benchmarks()
assert len(response) > 0
benchmark_id = "meta-reference-mmlu"
response = await eval_impl.run_eval(
task_id=benchmark_id,
task_config=BenchmarkEvalTaskConfig(
benchmark_id=benchmark_id,
task_config=BenchmarkBenchmarkConfig(
eval_candidate=ModelCandidate(
model=inference_model,
sampling_params=SamplingParams(),


@ -10,8 +10,8 @@ from typing import Any, Dict, List, Optional
from pydantic import BaseModel
from llama_stack.apis.benchmarks import BenchmarkInput
from llama_stack.apis.datasets import DatasetInput
from llama_stack.apis.eval_tasks import EvalTaskInput
from llama_stack.apis.models import ModelInput
from llama_stack.apis.scoring_functions import ScoringFnInput
from llama_stack.apis.shields import ShieldInput
@ -42,7 +42,7 @@ async def construct_stack_for_test(
vector_dbs: Optional[List[VectorDBInput]] = None,
datasets: Optional[List[DatasetInput]] = None,
scoring_fns: Optional[List[ScoringFnInput]] = None,
eval_tasks: Optional[List[EvalTaskInput]] = None,
benchmarks: Optional[List[BenchmarkInput]] = None,
tool_groups: Optional[List[ToolGroupInput]] = None,
) -> TestStack:
sqlite_file = tempfile.NamedTemporaryFile(delete=False, suffix=".db")
@ -56,7 +56,7 @@ async def construct_stack_for_test(
vector_dbs=vector_dbs or [],
datasets=datasets or [],
scoring_fns=scoring_fns or [],
eval_tasks=eval_tasks or [],
benchmarks=benchmarks or [],
tool_groups=tool_groups or [],
)
run_config = parse_and_maybe_upgrade_config(run_config)


@ -107,7 +107,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -109,7 +109,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -108,7 +108,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: brave-search


@ -99,7 +99,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: brave-search


@ -85,4 +85,4 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []


@ -164,7 +164,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -153,7 +153,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -116,7 +116,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -106,7 +106,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -116,7 +116,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -106,7 +106,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -118,7 +118,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -107,7 +107,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -109,7 +109,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -139,7 +139,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -113,7 +113,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -110,7 +110,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -118,7 +118,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -107,7 +107,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -118,7 +118,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -106,7 +106,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -105,7 +105,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -159,7 +159,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -148,7 +148,7 @@ shields:
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search


@ -109,7 +109,7 @@ shields: []
vector_dbs: []
datasets: []
scoring_fns: []
eval_tasks: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search