fix!: update eval-tasks -> benchmarks (#1032)

# What does this PR do?

- Update `/eval-tasks` to `/benchmarks`
- ⚠️ Remove the differentiation between `app` vs. `benchmark` eval task configs; there is now only `BenchmarkConfig`. The overloaded `benchmark` naming was confusing and did not add any value. Backward compatibility is kept, since the "type" field is not used anywhere.

## Test Plan

- This change is backward compatible.
- Run the notebook tests with:

```
pytest -v -s --nbval-lax ./docs/getting_started.ipynb
pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
```

(Screenshot of the passing test run: https://github.com/user-attachments/assets/d2fc06a7-593a-444f-bc1f-10ab9b0c843d)

---------

Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
Signed-off-by: Ben Browning <bbrownin@redhat.com>
Signed-off-by: Sébastien Han <seb@redhat.com>
Signed-off-by: reidliu <reid201711@gmail.com>
Co-authored-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
Co-authored-by: Ben Browning <ben324@gmail.com>
Co-authored-by: Sébastien Han <seb@redhat.com>
Co-authored-by: Reid <61492567+reidliu41@users.noreply.github.com>
Co-authored-by: reidliu <reid201711@gmail.com>
Co-authored-by: Yuan Tang <terrytangyuan@gmail.com>
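For orientation, the rename looks roughly like this from the Python client, pieced together from the documentation changes further down in this diff; the server URL, model id, and the exact `eval_candidate` fields are illustrative placeholders rather than part of this PR.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local server URL

# Old surface (still accepted for backward compatibility):
#   client.eval_tasks.register(eval_task_id="my_eval", ...)
#   client.eval.run_eval(task_id="my_eval", ...)

# New surface after this PR:
client.benchmarks.register(
    benchmark_id="my_eval",            # was eval_task_id
    dataset_id="my_dataset",
    scoring_functions=["accuracy"],
)

job = client.eval.run_eval(
    benchmark_id="my_eval",            # was task_id
    task_config={
        "type": "benchmark",           # single BenchmarkConfig; the app/benchmark split is gone
        "eval_candidate": {
            "type": "model",           # assumed ModelCandidate shape
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "sampling_params": {},
        },
    },
)
result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
```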
This commit is contained in:
- parent 225dd38e5c
- commit 8b655e3cd2

60 changed files with 2622 additions and 1910 deletions
docs/_static/llama-stack-spec.html (vendored, 2286 lines changed): file diff suppressed because it is too large
docs/_static/llama-stack-spec.yaml (vendored, 1465 lines changed): file diff suppressed because it is too large
````diff
@@ -324,7 +324,7 @@
 "- vector_io\n",
 "container_image: null\n",
 "datasets: <span style=\"font-weight: bold\">[]</span>\n",
-"eval_tasks: <span style=\"font-weight: bold\">[]</span>\n",
+"benchmarks: <span style=\"font-weight: bold\">[]</span>\n",
 "image_name: together\n",
 "metadata_store:\n",
 " db_path: <span style=\"color: #800080; text-decoration-color: #800080\">/Users/ashwin/.llama/distributions/together/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">registry.db</span>\n",
@@ -508,7 +508,7 @@
 "- vector_io\n",
 "container_image: null\n",
 "datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
-"eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
+"benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
 "image_name: together\n",
 "metadata_store:\n",
 " db_path: \u001b[35m/Users/ashwin/.llama/distributions/together/\u001b[0m\u001b[95mregistry.db\u001b[0m\n",
````

````diff
@@ -370,7 +370,7 @@
 "- tool_runtime\n",
 "datasets: <span style=\"font-weight: bold\">[]</span>\n",
 "container_image: null\n",
-"eval_tasks: <span style=\"font-weight: bold\">[]</span>\n",
+"benchmarks: <span style=\"font-weight: bold\">[]</span>\n",
 "image_name: together\n",
 "memory_banks: <span style=\"font-weight: bold\">[]</span>\n",
 "metadata_store:\n",
@@ -551,7 +551,7 @@
 "- tool_runtime\n",
 "datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
 "container_image: null\n",
-"eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
+"benchmarks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
 "image_name: together\n",
 "memory_banks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
 "metadata_store:\n",
````
````diff
@@ -647,6 +647,7 @@ class Generator:
         description = "\n".join(
             filter(None, [doc_string.short_description, doc_string.long_description])
         )
 
         return Operation(
             tags=[op.defining_class.__name__],
             summary=None,
@@ -656,6 +657,7 @@ class Generator:
             requestBody=requestBody,
             responses=responses,
             callbacks=callbacks,
+            deprecated=True if "DEPRECATED" in op.func_name else None,
             security=[] if op.public else None,
         )
 
````

````diff
@@ -117,6 +117,7 @@ class Operation:
     requestBody: Optional[RequestBody] = None
    callbacks: Optional[Dict[str, "Callback"]] = None
     security: Optional[List["SecurityRequirement"]] = None
+    deprecated: Optional[bool] = None
 
 
 @dataclass
````
````diff
@@ -41,14 +41,14 @@ system_message = {
     "content": SYSTEM_PROMPT_TEMPLATE,
 }
 
-client.eval_tasks.register(
-    eval_task_id="meta-reference::mmmu",
+client.benchmarks.register(
+    benchmark_id="meta-reference::mmmu",
     dataset_id=f"mmmu-{subset}-{split}",
     scoring_functions=["basic::regex_parser_multiple_choice_answer"],
 )
 
 response = client.eval.evaluate_rows(
-    task_id="meta-reference::mmmu",
+    benchmark_id="meta-reference::mmmu",
     input_rows=eval_rows,
     scoring_functions=["basic::regex_parser_multiple_choice_answer"],
     task_config={
@@ -99,14 +99,14 @@ eval_rows = client.datasetio.get_rows_paginated(
 ```
 
 ```python
-client.eval_tasks.register(
-    eval_task_id="meta-reference::simpleqa",
+client.benchmarks.register(
+    benchmark_id="meta-reference::simpleqa",
     dataset_id=simpleqa_dataset_id,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
 )
 
 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={
@@ -156,7 +156,7 @@ agent_config = {
 }
 
 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={
````
````diff
@@ -10,15 +10,15 @@ Here's how to set up basic evaluation:
 
 ```python
 # Create an evaluation task
-response = client.eval_tasks.register(
-    eval_task_id="my_eval",
+response = client.benchmarks.register(
+    benchmark_id="my_eval",
     dataset_id="my_dataset",
     scoring_functions=["accuracy", "relevance"],
 )
 
 # Run evaluation
 job = client.eval.run_eval(
-    task_id="my_eval",
+    benchmark_id="my_eval",
     task_config={
         "type": "app",
         "eval_candidate": {"type": "agent", "config": agent_config},
@@ -26,5 +26,5 @@ job = client.eval.run_eval(
 )
 
 # Get results
-result = client.eval.job_result(task_id="my_eval", job_id=job.job_id)
+result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
 ```
````
````diff
@@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl
 We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications.
 - `/datasetio` + `/datasets` API
 - `/scoring` + `/scoring_functions` API
-- `/eval` + `/eval_tasks` API
+- `/eval` + `/benchmarks` API
 
 This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
 
@@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
 - **Scoring**: evaluate outputs of the system.
   - Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics.
 - **Eval**: generate outputs (via Inference or Agents) and perform scoring.
-  - Associated with `EvalTask` resource.
+  - Associated with `Benchmark` resource.
 
 
 Use the following decision tree to decide how to use LlamaStack Evaluation flow.
````
````diff
@@ -42,7 +42,7 @@ Some of these APIs are associated with a set of **Resources**. Here is the mappi
 - **Tool Runtime** is associated with `ToolGroup` resources.
 - **DatasetIO** is associated with `Dataset` resources.
 - **Scoring** is associated with `ScoringFunction` resources.
-- **Eval** is associated with `Model` and `EvalTask` resources.
+- **Eval** is associated with `Model` and `Benchmark` resources.
 
 Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack.
 
````
````diff
@@ -64,7 +64,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
 ```
 
 ```bash
-$ llama-stack-client eval_tasks register \
+$ llama-stack-client benchmarks register \
 --eval-task-id meta-reference-mmlu \
 --provider-id meta-reference \
 --dataset-id mmlu \
@@ -86,7 +86,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
   - Under the hood, it uses Llama Stack's `/providers` API to get information about the providers.
 
 - **API Resources**: Inspect Llama Stack API resources
-  - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `eval_tasks`, `shields`).
+  - This page allows you to inspect Llama Stack API resources (`models`, `datasets`, `memory_banks`, `benchmarks`, `shields`).
   - Under the hood, it uses Llama Stack's `/<resources>/list` API to get information about each resources.
   - Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources.
 
````
````diff
@@ -5,7 +5,7 @@ The Llama Stack Evaluation flow allows you to run evaluations on your GenAI appl
 We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications.
 - `/datasetio` + `/datasets` API
 - `/scoring` + `/scoring_functions` API
-- `/eval` + `/eval_tasks` API
+- `/eval` + `/benchmarks` API
 
 This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Checkout our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
 
@@ -21,7 +21,7 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
 - **Scoring**: evaluate outputs of the system.
   - Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics.
 - **Eval**: generate outputs (via Inference or Agents) and perform scoring.
-  - Associated with `EvalTask` resource.
+  - Associated with `Benchmark` resource.
 
 
 Use the following decision tree to decide how to use LlamaStack Evaluation flow.
@@ -77,14 +77,14 @@ system_message = {
     "content": SYSTEM_PROMPT_TEMPLATE,
 }
 
-client.eval_tasks.register(
-    eval_task_id="meta-reference::mmmu",
+client.benchmarks.register(
+    benchmark_id="meta-reference::mmmu",
     dataset_id=f"mmmu-{subset}-{split}",
     scoring_functions=["basic::regex_parser_multiple_choice_answer"],
 )
 
 response = client.eval.evaluate_rows(
-    task_id="meta-reference::mmmu",
+    benchmark_id="meta-reference::mmmu",
     input_rows=eval_rows,
     scoring_functions=["basic::regex_parser_multiple_choice_answer"],
     task_config={
@@ -135,14 +135,14 @@ eval_rows = client.datasetio.get_rows_paginated(
 ```
 
 ```python
-client.eval_tasks.register(
-    eval_task_id="meta-reference::simpleqa",
+client.benchmarks.register(
+    benchmark_id="meta-reference::simpleqa",
     dataset_id=simpleqa_dataset_id,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
 )
 
 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={
@@ -192,7 +192,7 @@ agent_config = {
 }
 
 response = client.eval.evaluate_rows(
-    task_id="meta-reference::simpleqa",
+    benchmark_id="meta-reference::simpleqa",
     input_rows=eval_rows.rows,
     scoring_functions=["llm-as-judge::405b-simpleqa"],
     task_config={
@@ -281,7 +281,7 @@ The following examples give the quick steps to start running evaluations using t
 
 #### Benchmark Evaluation CLI
 Usage: There are 2 inputs necessary for running a benchmark eval
-- `eval-task-id`: the identifier associated with the eval task. Each `EvalTask` is parametrized by
+- `eval-task-id`: the identifier associated with the eval task. Each `Benchmark` is parametrized by
   - `dataset_id`: the identifier associated with the dataset.
   - `List[scoring_function_id]`: list of scoring function identifiers.
 - `eval-task-config`: specifies the configuration of the model / agent to evaluate on.
@@ -289,7 +289,7 @@ Usage: There are 2 inputs necessary for running a benchmark eval
 
 ```
 llama-stack-client eval run_benchmark <eval-task-id> \
---eval-task-config ~/eval_task_config.json \
+--eval-task-config ~/benchmark_config.json \
 --visualize
 ```
 
@@ -309,15 +309,15 @@ llama-stack-client eval run_scoring <scoring_fn_id_1> <scoring_fn_id_2> ... <sco
 --output-dir ./
 ```
 
-#### Defining EvalTaskConfig
-The `EvalTaskConfig` are user specified config to define:
+#### Defining BenchmarkConfig
+The `BenchmarkConfig` are user specified config to define:
 1. `EvalCandidate` to run generation on:
    - `ModelCandidate`: The model will be used for generation through LlamaStack /inference API.
    - `AgentCandidate`: The agentic system specified by AgentConfig will be used for generation through LlamaStack /agents API.
 2. Optionally scoring function params to allow customization of scoring function behaviour. This is useful to parameterize generic scoring functions such as LLMAsJudge with custom `judge_model` / `judge_prompt`.
 
 
-**Example Benchmark EvalTaskConfig**
+**Example Benchmark BenchmarkConfig**
 ```json
 {
     "type": "benchmark",
@@ -335,7 +335,7 @@ The `EvalTaskConfig` are user specified config to define:
 }
 ```
 
-**Example Application EvalTaskConfig**
+**Example Application BenchmarkConfig**
 ```json
 {
     "type": "app",
````
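Because the two example config blocks above are truncated in this capture, here is a hedged sketch of the general shape of a `task_config` after this change, using only the fields visible in this diff (`type`, `eval_candidate`, `scoring_params`); the model id and sampling parameters are placeholders.

```python
# Illustrative only: shape of a BenchmarkConfig-style task_config after the rename.
task_config = {
    "type": "benchmark",  # the old "app" vs "benchmark" split collapses into one config
    "eval_candidate": {
        "type": "model",  # or "agent" with an AgentConfig, per the candidate description above
        "model": "meta-llama/Llama-3.1-405B-Instruct",  # placeholder model id
        "sampling_params": {"temperature": 0.0},  # assumed ModelCandidate field
    },
    "scoring_params": {},  # optional per-scoring-function params, e.g. a custom judge_model / judge_prompt
}
```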
````diff
@@ -161,14 +161,14 @@ Options:
 
 ## Eval Task Management
 
-### `llama-stack-client eval_tasks list`
+### `llama-stack-client benchmarks list`
 ```bash
-$ llama-stack-client eval_tasks list
+$ llama-stack-client benchmarks list
 ```
 
-### `llama-stack-client eval_tasks register`
+### `llama-stack-client benchmarks register`
 ```bash
-$ llama-stack-client eval_tasks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
+$ llama-stack-client benchmarks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
 ```
 
 Options:
@@ -191,7 +191,7 @@ Options:
 - `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
 - `--visualize`: Optional flag. If set, visualizes evaluation results after completion
 
-Example eval_task_config.json:
+Example benchmark_config.json:
 ```json
 {
     "type": "benchmark",
````
````diff
@@ -181,8 +181,8 @@ from llama_stack_client.types import EvaluateResponse, Job
 
 Methods:
 
-- <code title="post /v1/eval/tasks/{task_id}/evaluations">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">evaluate_rows</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_evaluate_rows_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
-- <code title="post /v1/eval/tasks/{task_id}/jobs">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">run_eval</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_run_eval_params.py">params</a>) -> <a href="./src/llama_stack_client/types/job.py">Job</a></code>
+- <code title="post /v1/eval/tasks/{benchmark_id}/evaluations">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">evaluate_rows</a>(benchmark_id, \*\*<a href="src/llama_stack_client/types/eval_evaluate_rows_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
+- <code title="post /v1/eval/tasks/{benchmark_id}/jobs">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">run_eval</a>(benchmark_id, \*\*<a href="src/llama_stack_client/types/eval_run_eval_params.py">params</a>) -> <a href="./src/llama_stack_client/types/job.py">Job</a></code>
 
 ### Jobs
 
@@ -194,9 +194,9 @@ from llama_stack_client.types.eval import JobStatusResponse
 
 Methods:
 
-- <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}/result">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">retrieve</a>(job_id, \*, task_id) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
-- <code title="delete /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">cancel</a>(job_id, \*, task_id) -> None</code>
-- <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">status</a>(job_id, \*, task_id) -> Optional[JobStatusResponse]</code>
+- <code title="get /v1/eval/tasks/{benchmark_id}/jobs/{job_id}/result">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">retrieve</a>(job_id, \*, benchmark_id) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
+- <code title="delete /v1/eval/tasks/{benchmark_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">cancel</a>(job_id, \*, benchmark_id) -> None</code>
+- <code title="get /v1/eval/tasks/{benchmark_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">status</a>(job_id, \*, benchmark_id) -> Optional[JobStatusResponse]</code>
 
 ## Inspect
 
@@ -443,20 +443,20 @@ Methods:
 - <code title="get /v1/scoring-functions">client.scoring_functions.<a href="./src/llama_stack_client/resources/scoring_functions.py">list</a>() -> <a href="./src/llama_stack_client/types/scoring_function_list_response.py">ScoringFunctionListResponse</a></code>
 - <code title="post /v1/scoring-functions">client.scoring_functions.<a href="./src/llama_stack_client/resources/scoring_functions.py">register</a>(\*\*<a href="src/llama_stack_client/types/scoring_function_register_params.py">params</a>) -> None</code>
 
-## EvalTasks
+## Benchmarks
 
 Types:
 
 ```python
 from llama_stack_client.types import (
-    EvalTask,
-    ListEvalTasksResponse,
-    EvalTaskListResponse,
+    Benchmark,
+    ListBenchmarksResponse,
+    BenchmarkListResponse,
 )
 ```
 
 Methods:
 
-- <code title="get /v1/eval-tasks/{eval_task_id}">client.eval_tasks.<a href="./src/llama_stack_client/resources/eval_tasks.py">retrieve</a>(eval_task_id) -> <a href="./src/llama_stack_client/types/eval_task.py">Optional[EvalTask]</a></code>
-- <code title="get /v1/eval-tasks">client.eval_tasks.<a href="./src/llama_stack_client/resources/eval_tasks.py">list</a>() -> <a href="./src/llama_stack_client/types/eval_task_list_response.py">EvalTaskListResponse</a></code>
-- <code title="post /v1/eval-tasks">client.eval_tasks.<a href="./src/llama_stack_client/resources/eval_tasks.py">register</a>(\*\*<a href="src/llama_stack_client/types/eval_task_register_params.py">params</a>) -> None</code>
+- <code title="get /v1/eval-tasks/{benchmark_id}">client.benchmarks.<a href="./src/llama_stack_client/resources/benchmarks.py">retrieve</a>(benchmark_id) -> <a href="./src/llama_stack_client/types/benchmark.py">Optional[Benchmark]</a></code>
+- <code title="get /v1/eval-tasks">client.benchmarks.<a href="./src/llama_stack_client/resources/benchmarks.py">list</a>() -> <a href="./src/llama_stack_client/types/benchmark_list_response.py">BenchmarkListResponse</a></code>
+- <code title="post /v1/eval-tasks">client.benchmarks.<a href="./src/llama_stack_client/resources/benchmarks.py">register</a>(\*\*<a href="src/llama_stack_client/types/benchmark_register_params.py">params</a>) -> None</code>
````
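As a usage note for the `client.eval.jobs.*` methods listed above, a hedged polling sketch follows; the in-progress status names are assumptions, since the reference only names the `JobStatusResponse` type.

```python
import time


def wait_for_eval_job(client, benchmark_id: str, job_id: str, poll_seconds: float = 5.0):
    """Poll an eval job until it leaves an in-progress state, then fetch its result."""
    while True:
        status = client.eval.jobs.status(job_id, benchmark_id=benchmark_id)
        current = getattr(status, "status", status)  # assumed attribute on JobStatusResponse
        if status is None or current not in ("scheduled", "in_progress"):
            break  # assumed terminal once the job is no longer scheduled or running
        time.sleep(poll_seconds)
    # Per the reference above, retrieve() returns the EvaluateResponse for the job.
    return client.eval.jobs.retrieve(job_id, benchmark_id=benchmark_id)
```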
````diff
@@ -4,4 +4,4 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from .eval_tasks import * # noqa: F401 F403
+from .benchmarks import * # noqa: F401 F403
````
llama_stack/apis/benchmarks/benchmarks.py (new file, 86 lines):

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable

from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field

from llama_stack.apis.resource import Resource, ResourceType


class CommonBenchmarkFields(BaseModel):
    dataset_id: str
    scoring_functions: List[str]
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="Metadata for this evaluation task",
    )


@json_schema_type
class Benchmark(CommonBenchmarkFields, Resource):
    type: Literal[ResourceType.benchmark.value] = ResourceType.benchmark.value

    @property
    def benchmark_id(self) -> str:
        return self.identifier

    @property
    def provider_benchmark_id(self) -> str:
        return self.provider_resource_id


class BenchmarkInput(CommonBenchmarkFields, BaseModel):
    benchmark_id: str
    provider_id: Optional[str] = None
    provider_benchmark_id: Optional[str] = None


class ListBenchmarksResponse(BaseModel):
    data: List[Benchmark]


@runtime_checkable
class Benchmarks(Protocol):
    @webmethod(route="/eval/benchmarks", method="GET")
    async def list_benchmarks(self) -> ListBenchmarksResponse: ...

    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET")
    async def get_benchmark(
        self,
        benchmark_id: str,
    ) -> Optional[Benchmark]: ...

    @webmethod(route="/eval/benchmarks", method="POST")
    async def register_benchmark(
        self,
        benchmark_id: str,
        dataset_id: str,
        scoring_functions: List[str],
        provider_benchmark_id: Optional[str] = None,
        provider_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None: ...

    @webmethod(route="/eval-tasks", method="GET")
    async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ...

    @webmethod(route="/eval-tasks/{task_id}", method="GET")
    async def DEPRECATED_get_eval_task(
        self,
        eval_task_id: str,
    ) -> Optional[Benchmark]: ...

    @webmethod(route="/eval-tasks", method="POST")
    async def DEPRECATED_register_eval_task(
        self,
        eval_task_id: str,
        dataset_id: str,
        scoring_functions: List[str],
        provider_benchmark_id: Optional[str] = None,
        provider_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None: ...
```
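A minimal sketch of how the new `BenchmarkInput` might be used, mirroring the `benchmarks:` field that this diff adds to the distribution run config further down; the identifiers and metadata are placeholders.

```python
# Illustrative only: a BenchmarkInput as it might appear in a run config's
# `benchmarks:` list; the ids and metadata below are placeholders.
from llama_stack.apis.benchmarks import BenchmarkInput

mmlu_benchmark = BenchmarkInput(
    benchmark_id="meta-reference-mmlu",
    dataset_id="mmlu",
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
    metadata={"split": "test"},  # optional; defaults to an empty dict
)
```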
````diff
@@ -28,7 +28,7 @@ class Api(Enum):
     vector_dbs = "vector_dbs"
     datasets = "datasets"
     scoring_functions = "scoring_functions"
-    eval_tasks = "eval_tasks"
+    benchmarks = "benchmarks"
     tool_groups = "tool_groups"
 
     # built-in API
````
````diff
@@ -38,19 +38,9 @@ EvalCandidate = register_schema(
 
 
 @json_schema_type
-class BenchmarkEvalTaskConfig(BaseModel):
+class BenchmarkConfig(BaseModel):
     type: Literal["benchmark"] = "benchmark"
     eval_candidate: EvalCandidate
-    num_examples: Optional[int] = Field(
-        description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
-        default=None,
-    )
-
-
-@json_schema_type
-class AppEvalTaskConfig(BaseModel):
-    type: Literal["app"] = "app"
-    eval_candidate: EvalCandidate
     scoring_params: Dict[str, ScoringFnParams] = Field(
         description="Map between scoring function id and parameters for each scoring function you want to run",
         default_factory=dict,
@@ -62,12 +52,6 @@ class AppEvalTaskConfig(BaseModel):
     # we could optinally add any specific dataset config here
 
 
-EvalTaskConfig = register_schema(
-    Annotated[Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")],
-    name="EvalTaskConfig",
-)
-
-
 @json_schema_type
 class EvaluateResponse(BaseModel):
     generations: List[Dict[str, Any]]
@@ -76,27 +60,52 @@ class EvaluateResponse(BaseModel):
 
 
 class Eval(Protocol):
-    @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
+    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
     async def run_eval(
+        self,
+        benchmark_id: str,
+        task_config: BenchmarkConfig,
+    ) -> Job: ...
+
+    @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
+    async def evaluate_rows(
+        self,
+        benchmark_id: str,
+        input_rows: List[Dict[str, Any]],
+        scoring_functions: List[str],
+        task_config: BenchmarkConfig,
+    ) -> EvaluateResponse: ...
+
+    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
+    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ...
+
+    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
+    async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ...
+
+    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
+    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ...
+
+    @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
+    async def DEPRECATED_run_eval(
         self,
         task_id: str,
-        task_config: EvalTaskConfig,
+        task_config: BenchmarkConfig,
     ) -> Job: ...
 
     @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
-    async def evaluate_rows(
+    async def DEPRECATED_evaluate_rows(
         self,
         task_id: str,
         input_rows: List[Dict[str, Any]],
         scoring_functions: List[str],
-        task_config: EvalTaskConfig,
+        task_config: BenchmarkConfig,
     ) -> EvaluateResponse: ...
 
     @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
-    async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
+    async def DEPRECATED_job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
 
     @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
-    async def job_cancel(self, task_id: str, job_id: str) -> None: ...
+    async def DEPRECATED_job_cancel(self, task_id: str, job_id: str) -> None: ...
 
     @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
-    async def job_result(self, job_id: str, task_id: str) -> EvaluateResponse: ...
+    async def DEPRECATED_job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ...
````
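The backward-compatibility pattern in the protocol above is simple delegation: each `DEPRECATED_*` method forwards its `task_id` to the corresponding `benchmark_id` method. The `EvalRouter` changes later in this diff implement exactly this; what follows is a reduced, hypothetical sketch of the idea, not the actual provider code.

```python
# Hypothetical shim for illustration only: old task_id-based calls are forwarded
# to the new benchmark_id-based method unchanged.
class ExampleEvalShim:
    async def run_eval(self, benchmark_id: str, task_config: dict) -> str:
        # Stand-in for real provider logic; returns a fake job id.
        return f"job-for-{benchmark_id}"

    async def DEPRECATED_run_eval(self, task_id: str, task_config: dict) -> str:
        # Old callers keep working: task_id is simply treated as the benchmark_id.
        return await self.run_eval(benchmark_id=task_id, task_config=task_config)
```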
@ -1,66 +0,0 @@
|
||||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
|
||||||
# the root directory of this source tree.
|
|
||||||
from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
|
|
||||||
|
|
||||||
from llama_models.schema_utils import json_schema_type, webmethod
|
|
||||||
from pydantic import BaseModel, Field
|
|
||||||
|
|
||||||
from llama_stack.apis.resource import Resource, ResourceType
|
|
||||||
|
|
||||||
|
|
||||||
class CommonEvalTaskFields(BaseModel):
|
|
||||||
dataset_id: str
|
|
||||||
scoring_functions: List[str]
|
|
||||||
metadata: Dict[str, Any] = Field(
|
|
||||||
default_factory=dict,
|
|
||||||
description="Metadata for this evaluation task",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@json_schema_type
|
|
||||||
class EvalTask(CommonEvalTaskFields, Resource):
|
|
||||||
type: Literal[ResourceType.eval_task.value] = ResourceType.eval_task.value
|
|
||||||
|
|
||||||
@property
|
|
||||||
def eval_task_id(self) -> str:
|
|
||||||
return self.identifier
|
|
||||||
|
|
||||||
@property
|
|
||||||
def provider_eval_task_id(self) -> str:
|
|
||||||
return self.provider_resource_id
|
|
||||||
|
|
||||||
|
|
||||||
class EvalTaskInput(CommonEvalTaskFields, BaseModel):
|
|
||||||
eval_task_id: str
|
|
||||||
provider_id: Optional[str] = None
|
|
||||||
provider_eval_task_id: Optional[str] = None
|
|
||||||
|
|
||||||
|
|
||||||
class ListEvalTasksResponse(BaseModel):
|
|
||||||
data: List[EvalTask]
|
|
||||||
|
|
||||||
|
|
||||||
@runtime_checkable
|
|
||||||
class EvalTasks(Protocol):
|
|
||||||
@webmethod(route="/eval-tasks", method="GET")
|
|
||||||
async def list_eval_tasks(self) -> ListEvalTasksResponse: ...
|
|
||||||
|
|
||||||
@webmethod(route="/eval-tasks/{eval_task_id}", method="GET")
|
|
||||||
async def get_eval_task(
|
|
||||||
self,
|
|
||||||
eval_task_id: str,
|
|
||||||
) -> Optional[EvalTask]: ...
|
|
||||||
|
|
||||||
@webmethod(route="/eval-tasks", method="POST")
|
|
||||||
async def register_eval_task(
|
|
||||||
self,
|
|
||||||
eval_task_id: str,
|
|
||||||
dataset_id: str,
|
|
||||||
scoring_functions: List[str],
|
|
||||||
provider_eval_task_id: Optional[str] = None,
|
|
||||||
provider_id: Optional[str] = None,
|
|
||||||
metadata: Optional[Dict[str, Any]] = None,
|
|
||||||
) -> None: ...
|
|
|
````diff
@@ -15,7 +15,7 @@ class ResourceType(Enum):
     vector_db = "vector_db"
     dataset = "dataset"
     scoring_function = "scoring_function"
-    eval_task = "eval_task"
+    benchmark = "benchmark"
     tool = "tool"
     tool_group = "tool_group"
 
````
````diff
@@ -8,10 +8,10 @@ from typing import Annotated, Any, Dict, List, Optional, Union
 
 from pydantic import BaseModel, Field
 
+from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Dataset, DatasetInput
 from llama_stack.apis.eval import Eval
-from llama_stack.apis.eval_tasks import EvalTask, EvalTaskInput
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.models import Model, ModelInput
 from llama_stack.apis.safety import Safety
@@ -37,7 +37,7 @@ RoutableObject = Union[
     VectorDB,
     Dataset,
     ScoringFn,
-    EvalTask,
+    Benchmark,
     Tool,
     ToolGroup,
 ]
@@ -50,7 +50,7 @@ RoutableObjectWithProvider = Annotated[
         VectorDB,
         Dataset,
         ScoringFn,
-        EvalTask,
+        Benchmark,
         Tool,
         ToolGroup,
     ],
@@ -173,7 +173,7 @@ a default SQLite store will be used.""",
     vector_dbs: List[VectorDBInput] = Field(default_factory=list)
     datasets: List[DatasetInput] = Field(default_factory=list)
     scoring_fns: List[ScoringFnInput] = Field(default_factory=list)
-    eval_tasks: List[EvalTaskInput] = Field(default_factory=list)
+    benchmarks: List[BenchmarkInput] = Field(default_factory=list)
     tool_groups: List[ToolGroupInput] = Field(default_factory=list)
 
     server: ServerConfig = Field(
````
````diff
@@ -44,7 +44,7 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]:
             router_api=Api.scoring,
         ),
         AutoRoutedApiInfo(
-            routing_table_api=Api.eval_tasks,
+            routing_table_api=Api.benchmarks,
             router_api=Api.eval,
         ),
         AutoRoutedApiInfo(
````
````diff
@@ -9,10 +9,10 @@ import logging
 from typing import Any, Dict, List, Set
 
 from llama_stack.apis.agents import Agents
+from llama_stack.apis.benchmarks import Benchmarks
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.eval import Eval
-from llama_stack.apis.eval_tasks import EvalTasks
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.inspect import Inspect
 from llama_stack.apis.models import Models
@@ -37,8 +37,8 @@ from llama_stack.distribution.store import DistributionRegistry
 from llama_stack.distribution.utils.dynamic import instantiate_class_type
 from llama_stack.providers.datatypes import (
     Api,
+    BenchmarksProtocolPrivate,
     DatasetsProtocolPrivate,
-    EvalTasksProtocolPrivate,
     InlineProviderSpec,
     ModelsProtocolPrivate,
     ProviderSpec,
@@ -73,7 +73,7 @@ def api_protocol_map() -> Dict[Api, Any]:
         Api.scoring: Scoring,
         Api.scoring_functions: ScoringFunctions,
         Api.eval: Eval,
-        Api.eval_tasks: EvalTasks,
+        Api.benchmarks: Benchmarks,
         Api.post_training: PostTraining,
         Api.tool_groups: ToolGroups,
         Api.tool_runtime: ToolRuntime,
@@ -92,7 +92,7 @@ def additional_protocols_map() -> Dict[Api, Any]:
             ScoringFunctions,
             Api.scoring_functions,
         ),
-        Api.eval: (EvalTasksProtocolPrivate, EvalTasks, Api.eval_tasks),
+        Api.eval: (BenchmarksProtocolPrivate, Benchmarks, Api.benchmarks),
     }
 
 
````
````diff
@@ -11,8 +11,8 @@ from llama_stack.distribution.store import DistributionRegistry
 from llama_stack.providers.datatypes import Api, RoutingTable
 
 from .routing_tables import (
+    BenchmarksRoutingTable,
     DatasetsRoutingTable,
-    EvalTasksRoutingTable,
     ModelsRoutingTable,
     ScoringFunctionsRoutingTable,
     ShieldsRoutingTable,
@@ -33,7 +33,7 @@ async def get_routing_table_impl(
         "shields": ShieldsRoutingTable,
         "datasets": DatasetsRoutingTable,
         "scoring_functions": ScoringFunctionsRoutingTable,
-        "eval_tasks": EvalTasksRoutingTable,
+        "benchmarks": BenchmarksRoutingTable,
         "tool_groups": ToolGroupsRoutingTable,
     }
 
````
````diff
@@ -9,9 +9,8 @@ from typing import Any, AsyncGenerator, Dict, List, Optional
 from llama_stack.apis.common.content_types import URL, InterleavedContent
 from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult
 from llama_stack.apis.eval import (
-    AppEvalTaskConfig,
+    BenchmarkConfig,
     Eval,
-    EvalTaskConfig,
     EvaluateResponse,
     Job,
     JobStatus,
@@ -347,23 +346,23 @@ class EvalRouter(Eval):
 
     async def run_eval(
         self,
-        task_id: str,
-        task_config: AppEvalTaskConfig,
+        benchmark_id: str,
+        task_config: BenchmarkConfig,
     ) -> Job:
-        return await self.routing_table.get_provider_impl(task_id).run_eval(
-            task_id=task_id,
+        return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
+            benchmark_id=benchmark_id,
             task_config=task_config,
         )
 
     async def evaluate_rows(
         self,
-        task_id: str,
+        benchmark_id: str,
         input_rows: List[Dict[str, Any]],
         scoring_functions: List[str],
-        task_config: EvalTaskConfig,
+        task_config: BenchmarkConfig,
     ) -> EvaluateResponse:
-        return await self.routing_table.get_provider_impl(task_id).evaluate_rows(
-            task_id=task_id,
+        return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows(
+            benchmark_id=benchmark_id,
             input_rows=input_rows,
             scoring_functions=scoring_functions,
             task_config=task_config,
@@ -371,30 +370,72 @@ class EvalRouter(Eval):
 
     async def job_status(
         self,
-        task_id: str,
+        benchmark_id: str,
         job_id: str,
     ) -> Optional[JobStatus]:
-        return await self.routing_table.get_provider_impl(task_id).job_status(task_id, job_id)
+        return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id)
 
     async def job_cancel(
         self,
-        task_id: str,
+        benchmark_id: str,
         job_id: str,
     ) -> None:
-        await self.routing_table.get_provider_impl(task_id).job_cancel(
-            task_id,
+        await self.routing_table.get_provider_impl(benchmark_id).job_cancel(
+            benchmark_id,
             job_id,
         )
 
     async def job_result(
+        self,
+        benchmark_id: str,
+        job_id: str,
+    ) -> EvaluateResponse:
+        return await self.routing_table.get_provider_impl(benchmark_id).job_result(
+            benchmark_id,
+            job_id,
+        )
+
+    async def DEPRECATED_run_eval(
+        self,
+        task_id: str,
+        task_config: BenchmarkConfig,
+    ) -> Job:
+        return await self.run_eval(benchmark_id=task_id, task_config=task_config)
+
+    async def DEPRECATED_evaluate_rows(
+        self,
+        task_id: str,
+        input_rows: List[Dict[str, Any]],
+        scoring_functions: List[str],
+        task_config: BenchmarkConfig,
+    ) -> EvaluateResponse:
+        return await self.evaluate_rows(
+            benchmark_id=task_id,
+            input_rows=input_rows,
+            scoring_functions=scoring_functions,
+            task_config=task_config,
+        )
+
+    async def DEPRECATED_job_status(
+        self,
+        task_id: str,
+        job_id: str,
+    ) -> Optional[JobStatus]:
+        return await self.job_status(benchmark_id=task_id, job_id=job_id)
+
+    async def DEPRECATED_job_cancel(
+        self,
+        task_id: str,
+        job_id: str,
+    ) -> None:
+        return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
+
+    async def DEPRECATED_job_result(
         self,
         task_id: str,
         job_id: str,
     ) -> EvaluateResponse:
-        return await self.routing_table.get_provider_impl(task_id).job_result(
-            task_id,
-            job_id,
-        )
+        return await self.job_result(benchmark_id=task_id, job_id=job_id)
 
 
 class ToolRuntimeRouter(ToolRuntime):
````
@@ -4,14 +4,15 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import logging
 from typing import Any, Dict, List, Optional

 from pydantic import TypeAdapter

+from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse
 from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.common.type_system import ParamType
 from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse
-from llama_stack.apis.eval_tasks import EvalTask, EvalTasks, ListEvalTasksResponse
 from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType
 from llama_stack.apis.resource import ResourceType
 from llama_stack.apis.scoring_functions import (

@@ -38,6 +39,8 @@ from llama_stack.distribution.datatypes import (
 from llama_stack.distribution.store import DistributionRegistry
 from llama_stack.providers.datatypes import Api, RoutingTable

+logger = logging.getLogger(__name__)


 def get_impl_api(p: Any) -> Api:
     return p.__provider_spec__.api

@@ -60,7 +63,7 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable
     elif api == Api.scoring:
         return await p.register_scoring_function(obj)
     elif api == Api.eval:
-        return await p.register_eval_task(obj)
+        return await p.register_benchmark(obj)
     elif api == Api.tool_runtime:
         return await p.register_tool(obj)
     else:

@@ -121,7 +124,7 @@ class CommonRoutingTableImpl(RoutingTable):
                 scoring_functions = await p.list_scoring_functions()
                 await add_objects(scoring_functions, pid, ScoringFn)
             elif api == Api.eval:
-                p.eval_task_store = self
+                p.benchmark_store = self
             elif api == Api.tool_runtime:
                 p.tool_store = self

@@ -141,8 +144,8 @@ class CommonRoutingTableImpl(RoutingTable):
             return ("DatasetIO", "dataset")
         elif isinstance(self, ScoringFunctionsRoutingTable):
             return ("Scoring", "scoring_function")
-        elif isinstance(self, EvalTasksRoutingTable):
+        elif isinstance(self, BenchmarksRoutingTable):
-            return ("Eval", "eval_task")
+            return ("Eval", "benchmark")
         elif isinstance(self, ToolGroupsRoutingTable):
             return ("Tools", "tool")
         else:

@@ -428,20 +431,20 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
         await self.register_object(scoring_fn)


-class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks):
+class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
-    async def list_eval_tasks(self) -> ListEvalTasksResponse:
+    async def list_benchmarks(self) -> ListBenchmarksResponse:
-        return ListEvalTasksResponse(data=await self.get_all_with_type("eval_task"))
+        return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark"))

-    async def get_eval_task(self, eval_task_id: str) -> Optional[EvalTask]:
+    async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]:
-        return await self.get_object_by_identifier("eval_task", eval_task_id)
+        return await self.get_object_by_identifier("benchmark", benchmark_id)

-    async def register_eval_task(
+    async def register_benchmark(
         self,
-        eval_task_id: str,
+        benchmark_id: str,
         dataset_id: str,
         scoring_functions: List[str],
         metadata: Optional[Dict[str, Any]] = None,
-        provider_eval_task_id: Optional[str] = None,
+        provider_benchmark_id: Optional[str] = None,
         provider_id: Optional[str] = None,
     ) -> None:
         if metadata is None:

@@ -453,17 +456,46 @@ class EvalTasksRoutingTable(CommonRoutingTableImpl, EvalTasks):
                 raise ValueError(
                     "No provider specified and multiple providers available. Please specify a provider_id."
                 )
-        if provider_eval_task_id is None:
+        if provider_benchmark_id is None:
-            provider_eval_task_id = eval_task_id
+            provider_benchmark_id = benchmark_id
-        eval_task = EvalTask(
+        benchmark = Benchmark(
-            identifier=eval_task_id,
+            identifier=benchmark_id,
             dataset_id=dataset_id,
             scoring_functions=scoring_functions,
             metadata=metadata,
             provider_id=provider_id,
-            provider_resource_id=provider_eval_task_id,
+            provider_resource_id=provider_benchmark_id,
+        )
+        await self.register_object(benchmark)

+    async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse:
+        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
+        return await self.list_benchmarks()

+    async def DEPRECATED_get_eval_task(
+        self,
+        eval_task_id: str,
+    ) -> Optional[Benchmark]:
+        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
+        return await self.get_benchmark(eval_task_id)

+    async def DEPRECATED_register_eval_task(
+        self,
+        eval_task_id: str,
+        dataset_id: str,
+        scoring_functions: List[str],
+        provider_benchmark_id: Optional[str] = None,
+        provider_id: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
+        return await self.register_benchmark(
+            benchmark_id=eval_task_id,
+            dataset_id=dataset_id,
+            scoring_functions=scoring_functions,
+            metadata=metadata,
+            provider_benchmark_id=provider_benchmark_id,
         )
-        await self.register_object(eval_task)


 class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):

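For orientation, here is a minimal sketch of how the renamed routing-table surface is meant to be called after this change. The `benchmarks_impl` handle is a placeholder for a resolved `BenchmarksRoutingTable` (how you obtain it depends on your stack setup), and the IDs simply mirror the ones used in the tests further down; this is not part of the diff itself.

```python
from typing import Optional

from llama_stack.apis.benchmarks import Benchmark


async def register_mmlu_benchmark(benchmarks_impl) -> Optional[Benchmark]:
    # The keyword is now benchmark_id (formerly eval_task_id); the old
    # DEPRECATED_register_eval_task wrapper still forwards here.
    await benchmarks_impl.register_benchmark(
        benchmark_id="meta-reference-mmlu",
        dataset_id="mmlu",
        scoring_functions=["basic::regex_parser_multiple_choice_answer"],
    )
    # Look the benchmark up again through the renamed getter.
    return await benchmarks_impl.get_benchmark("meta-reference-mmlu")
```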
@@ -15,10 +15,10 @@ from termcolor import colored

 from llama_stack.apis.agents import Agents
 from llama_stack.apis.batch_inference import BatchInference
+from llama_stack.apis.benchmarks import Benchmarks
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.eval import Eval
-from llama_stack.apis.eval_tasks import EvalTasks
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.inspect import Inspect
 from llama_stack.apis.models import Models

@@ -53,7 +53,7 @@ class LlamaStack(
     PostTraining,
     VectorIO,
     Eval,
-    EvalTasks,
+    Benchmarks,
     Scoring,
     ScoringFunctions,
     DatasetIO,

@@ -78,7 +78,7 @@ RESOURCES = [
         "register_scoring_function",
         "list_scoring_functions",
     ),
-    ("eval_tasks", Api.eval_tasks, "register_eval_task", "list_eval_tasks"),
+    ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks"),
     ("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups"),
 ]

@@ -26,7 +26,7 @@ $ llama-stack-client datasets register \
 ```

 ```bash
-$ llama-stack-client eval_tasks register \
+$ llama-stack-client benchmarks register \
 --eval-task-id meta-reference-mmlu \
 --provider-id meta-reference \
 --dataset-id mmlu \

@@ -8,12 +8,12 @@ import streamlit as st
 from modules.api import llama_stack_api


-def eval_tasks():
+def benchmarks():
-    # Eval Tasks Section
+    # Benchmarks Section
-    st.header("Eval Tasks")
+    st.header("Benchmarks")

-    eval_tasks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.eval_tasks.list()}
+    benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()}

-    if len(eval_tasks_info) > 0:
+    if len(benchmarks_info) > 0:
-        selected_eval_task = st.selectbox("Select an eval task", list(eval_tasks_info.keys()), key="eval_task_inspect")
+        selected_benchmark = st.selectbox("Select an eval task", list(benchmarks_info.keys()), key="benchmark_inspect")
-        st.json(eval_tasks_info[selected_eval_task], expanded=True)
+        st.json(benchmarks_info[selected_benchmark], expanded=True)

@@ -4,8 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from page.distribution.benchmarks import benchmarks
 from page.distribution.datasets import datasets
-from page.distribution.eval_tasks import eval_tasks
 from page.distribution.models import models
 from page.distribution.scoring_functions import scoring_functions
 from page.distribution.shields import shields

@@ -20,7 +20,7 @@ def resources_page():
         "Shields",
         "Scoring Functions",
         "Datasets",
-        "Eval Tasks",
+        "Benchmarks",
     ]
     icons = ["magic", "memory", "shield", "file-bar-graph", "database", "list-task"]
     selected_resource = option_menu(

@@ -34,8 +34,8 @@ def resources_page():
            },
        },
    )
-    if selected_resource == "Eval Tasks":
+    if selected_resource == "Benchmarks":
-        eval_tasks()
+        benchmarks()
    elif selected_resource == "Vector Databases":
        vector_dbs()
    elif selected_resource == "Datasets":

@@ -11,28 +11,28 @@ import streamlit as st
 from modules.api import llama_stack_api


-def select_eval_task_1():
+def select_benchmark_1():
-    # Select Eval Tasks
+    # Select Benchmarks
     st.subheader("1. Choose An Eval Task")
-    eval_tasks = llama_stack_api.client.eval_tasks.list()
+    benchmarks = llama_stack_api.client.benchmarks.list()
-    eval_tasks = {et.identifier: et for et in eval_tasks}
+    benchmarks = {et.identifier: et for et in benchmarks}
-    eval_tasks_names = list(eval_tasks.keys())
+    benchmarks_names = list(benchmarks.keys())
-    selected_eval_task = st.selectbox(
+    selected_benchmark = st.selectbox(
         "Choose an eval task.",
-        options=eval_tasks_names,
+        options=benchmarks_names,
         help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
     )
     with st.expander("View Eval Task"):
-        st.json(eval_tasks[selected_eval_task], expanded=True)
+        st.json(benchmarks[selected_benchmark], expanded=True)

-    st.session_state["selected_eval_task"] = selected_eval_task
+    st.session_state["selected_benchmark"] = selected_benchmark
-    st.session_state["eval_tasks"] = eval_tasks
+    st.session_state["benchmarks"] = benchmarks
     if st.button("Confirm", key="confirm_1"):
-        st.session_state["selected_eval_task_1_next"] = True
+        st.session_state["selected_benchmark_1_next"] = True


 def define_eval_candidate_2():
-    if not st.session_state.get("selected_eval_task_1_next", None):
+    if not st.session_state.get("selected_benchmark_1_next", None):
         return

     st.subheader("2. Define Eval Candidate")

@@ -161,11 +161,11 @@ def run_evaluation_3():
        Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
        """
    )
-    selected_eval_task = st.session_state["selected_eval_task"]
+    selected_benchmark = st.session_state["selected_benchmark"]
-    eval_tasks = st.session_state["eval_tasks"]
+    benchmarks = st.session_state["benchmarks"]
     eval_candidate = st.session_state["eval_candidate"]

-    dataset_id = eval_tasks[selected_eval_task].dataset_id
+    dataset_id = benchmarks[selected_benchmark].dataset_id
     rows = llama_stack_api.client.datasetio.get_rows_paginated(
         dataset_id=dataset_id,
         rows_in_page=-1,

@@ -180,16 +180,16 @@ def run_evaluation_3():
         help="Number of examples from the dataset to evaluate. ",
     )

-    eval_task_config = {
+    benchmark_config = {
         "type": "benchmark",
         "eval_candidate": eval_candidate,
         "scoring_params": {},
     }

     with st.expander("View Evaluation Task", expanded=True):
-        st.json(eval_tasks[selected_eval_task], expanded=True)
+        st.json(benchmarks[selected_benchmark], expanded=True)
     with st.expander("View Evaluation Task Configuration", expanded=True):
-        st.json(eval_task_config, expanded=True)
+        st.json(benchmark_config, expanded=True)

     # Add run button and handle evaluation
     if st.button("Run Evaluation"):

@@ -209,10 +209,10 @@ def run_evaluation_3():
             progress_bar.progress(progress, text=progress_text)
             # Run evaluation for current row
             eval_res = llama_stack_api.client.eval.evaluate_rows(
-                task_id=selected_eval_task,
+                benchmark_id=selected_benchmark,
                 input_rows=[r],
-                scoring_functions=eval_tasks[selected_eval_task].scoring_functions,
+                scoring_functions=benchmarks[selected_benchmark].scoring_functions,
-                task_config=eval_task_config,
+                task_config=benchmark_config,
             )

             for k in r.keys():

@@ -225,7 +225,7 @@ def run_evaluation_3():
                 output_res[k] = []
                 output_res[k].append(eval_res.generations[0][k])

-            for scoring_fn in eval_tasks[selected_eval_task].scoring_functions:
+            for scoring_fn in benchmarks[selected_benchmark].scoring_functions:
                 if scoring_fn not in output_res:
                     output_res[scoring_fn] = []
                 output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])

@@ -245,7 +245,7 @@ def native_evaluation_page():
     st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
     st.title("📊 Evaluations (Generation + Scoring)")

-    select_eval_task_1()
+    select_benchmark_1()
     define_eval_candidate_2()
     run_evaluation_3()

@@ -10,9 +10,9 @@ from urllib.parse import urlparse
 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field

+from llama_stack.apis.benchmarks import Benchmark
 from llama_stack.apis.datasets import Dataset
 from llama_stack.apis.datatypes import Api
-from llama_stack.apis.eval_tasks import EvalTask
 from llama_stack.apis.models import Model
 from llama_stack.apis.scoring_functions import ScoringFn
 from llama_stack.apis.shields import Shield

@@ -48,8 +48,8 @@ class ScoringFunctionsProtocolPrivate(Protocol):
     async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ...


-class EvalTasksProtocolPrivate(Protocol):
+class BenchmarksProtocolPrivate(Protocol):
-    async def register_eval_task(self, eval_task: EvalTask) -> None: ...
+    async def register_benchmark(self, benchmark: Benchmark) -> None: ...


 class ToolsProtocolPrivate(Protocol):

@@ -8,13 +8,13 @@ from typing import Any, Dict, List, Optional
 from tqdm import tqdm

 from llama_stack.apis.agents import Agents, StepType
+from llama_stack.apis.benchmarks import Benchmark
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.eval_tasks import EvalTask
 from llama_stack.apis.inference import Inference, UserMessage
 from llama_stack.apis.scoring import Scoring
 from llama_stack.distribution.datatypes import Api
-from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
+from llama_stack.providers.datatypes import BenchmarksProtocolPrivate
 from llama_stack.providers.inline.agents.meta_reference.agent_instance import (
     MEMORY_QUERY_TOOL,
 )

@@ -26,15 +26,15 @@ from llama_stack.providers.utils.common.data_schema_validator import (
 from llama_stack.providers.utils.kvstore import kvstore_impl

 from .....apis.common.job_types import Job
-from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus
+from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse, JobStatus
 from .config import MetaReferenceEvalConfig

-EVAL_TASKS_PREFIX = "eval_tasks:"
+EVAL_TASKS_PREFIX = "benchmarks:"


 class MetaReferenceEvalImpl(
     Eval,
-    EvalTasksProtocolPrivate,
+    BenchmarksProtocolPrivate,
 ):
     def __init__(
         self,

@@ -55,36 +55,36 @@ class MetaReferenceEvalImpl(
         # TODO: assume sync job, will need jobs API for async scheduling
         self.jobs = {}

-        self.eval_tasks = {}
+        self.benchmarks = {}

     async def initialize(self) -> None:
         self.kvstore = await kvstore_impl(self.config.kvstore)
-        # Load existing eval_tasks from kvstore
+        # Load existing benchmarks from kvstore
         start_key = EVAL_TASKS_PREFIX
         end_key = f"{EVAL_TASKS_PREFIX}\xff"
-        stored_eval_tasks = await self.kvstore.range(start_key, end_key)
+        stored_benchmarks = await self.kvstore.range(start_key, end_key)

-        for eval_task in stored_eval_tasks:
+        for benchmark in stored_benchmarks:
-            eval_task = EvalTask.model_validate_json(eval_task)
+            benchmark = Benchmark.model_validate_json(benchmark)
-            self.eval_tasks[eval_task.identifier] = eval_task
+            self.benchmarks[benchmark.identifier] = benchmark

     async def shutdown(self) -> None: ...

-    async def register_eval_task(self, task_def: EvalTask) -> None:
+    async def register_benchmark(self, task_def: Benchmark) -> None:
         # Store in kvstore
         key = f"{EVAL_TASKS_PREFIX}{task_def.identifier}"
         await self.kvstore.set(
             key=key,
             value=task_def.model_dump_json(),
         )
-        self.eval_tasks[task_def.identifier] = task_def
+        self.benchmarks[task_def.identifier] = task_def

     async def run_eval(
         self,
-        task_id: str,
+        benchmark_id: str,
-        task_config: EvalTaskConfig,
+        task_config: BenchmarkConfig,
     ) -> Job:
-        task_def = self.eval_tasks[task_id]
+        task_def = self.benchmarks[benchmark_id]
         dataset_id = task_def.dataset_id
         candidate = task_config.eval_candidate
         scoring_functions = task_def.scoring_functions

@@ -95,7 +95,7 @@ class MetaReferenceEvalImpl(
             rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples),
         )
         res = await self.evaluate_rows(
-            task_id=task_id,
+            benchmark_id=benchmark_id,
             input_rows=all_rows.rows,
             scoring_functions=scoring_functions,
             task_config=task_config,

@@ -108,7 +108,7 @@ class MetaReferenceEvalImpl(
         return Job(job_id=job_id)

     async def _run_agent_generation(
-        self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
+        self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
     ) -> List[Dict[str, Any]]:
         candidate = task_config.eval_candidate
         create_response = await self.agents_api.create_agent(candidate.config)

@@ -151,7 +151,7 @@ class MetaReferenceEvalImpl(
         return generations

     async def _run_model_generation(
-        self, input_rows: List[Dict[str, Any]], task_config: EvalTaskConfig
+        self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
     ) -> List[Dict[str, Any]]:
         candidate = task_config.eval_candidate
         assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"

@@ -187,10 +187,10 @@ class MetaReferenceEvalImpl(

     async def evaluate_rows(
         self,
-        task_id: str,
+        benchmark_id: str,
         input_rows: List[Dict[str, Any]],
         scoring_functions: List[str],
-        task_config: EvalTaskConfig,
+        task_config: BenchmarkConfig,
     ) -> EvaluateResponse:
         candidate = task_config.eval_candidate
         if candidate.type == "agent":

@@ -203,7 +203,7 @@ class MetaReferenceEvalImpl(
         # scoring with generated_answer
         score_input_rows = [input_r | generated_r for input_r, generated_r in zip(input_rows, generations)]

-        if task_config.type == "app" and task_config.scoring_params is not None:
+        if task_config.scoring_params is not None:
             scoring_functions_dict = {
                 scoring_fn_id: task_config.scoring_params.get(scoring_fn_id, None)
                 for scoring_fn_id in scoring_functions

@@ -217,18 +217,60 @@ class MetaReferenceEvalImpl(

         return EvaluateResponse(generations=generations, scores=score_response.results)

-    async def job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]:
+    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
         if job_id in self.jobs:
             return JobStatus.completed

         return None

-    async def job_cancel(self, task_id: str, job_id: str) -> None:
+    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
         raise NotImplementedError("Job cancel is not implemented yet")

-    async def job_result(self, task_id: str, job_id: str) -> EvaluateResponse:
+    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
-        status = await self.job_status(task_id, job_id)
+        status = await self.job_status(benchmark_id, job_id)
         if not status or status != JobStatus.completed:
             raise ValueError(f"Job is not completed, Status: {status.value}")

         return self.jobs[job_id]

+    async def DEPRECATED_run_eval(
+        self,
+        task_id: str,
+        task_config: BenchmarkConfig,
+    ) -> Job:
+        return await self.run_eval(benchmark_id=task_id, task_config=task_config)

+    async def DEPRECATED_evaluate_rows(
+        self,
+        task_id: str,
+        input_rows: List[Dict[str, Any]],
+        scoring_functions: List[str],
+        task_config: BenchmarkConfig,
+    ) -> EvaluateResponse:
+        return await self.evaluate_rows(
+            benchmark_id=task_id,
+            input_rows=input_rows,
+            scoring_functions=scoring_functions,
+            task_config=task_config,
+        )

+    async def DEPRECATED_job_status(
+        self,
+        task_id: str,
+        job_id: str,
+    ) -> Optional[JobStatus]:
+        return await self.job_status(benchmark_id=task_id, job_id=job_id)

+    async def DEPRECATED_job_cancel(
+        self,
+        task_id: str,
+        job_id: str,
+    ) -> None:
+        return await self.job_cancel(benchmark_id=task_id, job_id=job_id)

+    async def DEPRECATED_job_result(
+        self,
+        task_id: str,
+        job_id: str,
+    ) -> EvaluateResponse:
+        return await self.job_result(benchmark_id=task_id, job_id=job_id)

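As a companion to the provider changes above, a rough sketch of driving an eval run end to end with the renamed parameters. The `eval_impl` handle and model ID are placeholders; the config and candidate classes are the ones imported in the test diff that follows, so treat this as an illustration rather than the definitive API.

```python
from typing import Optional

from llama_stack.apis.eval.eval import (
    AppBenchmarkConfig,
    EvaluateResponse,
    JobStatus,
    ModelCandidate,
)
from llama_stack.apis.inference import SamplingParams


async def run_benchmark(eval_impl, benchmark_id: str, model_id: str) -> Optional[EvaluateResponse]:
    # run_eval now keys on benchmark_id; DEPRECATED_run_eval keeps accepting task_id.
    job = await eval_impl.run_eval(
        benchmark_id=benchmark_id,
        task_config=AppBenchmarkConfig(
            eval_candidate=ModelCandidate(
                model=model_id,
                sampling_params=SamplingParams(),
            ),
        ),
    )
    # The synchronous meta-reference provider marks the job completed right away.
    status = await eval_impl.job_status(benchmark_id, job.job_id)
    if status == JobStatus.completed:
        return await eval_impl.job_result(benchmark_id, job.job_id)
    return None
```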
@@ -10,8 +10,8 @@ import pytest
 from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.common.type_system import ChatCompletionInputType, StringType
 from llama_stack.apis.eval.eval import (
-    AppEvalTaskConfig,
+    AppBenchmarkConfig,
-    BenchmarkEvalTaskConfig,
+    BenchmarkBenchmarkConfig,
     ModelCandidate,
 )
 from llama_stack.apis.inference import SamplingParams

@@ -30,18 +30,18 @@ from .constants import JUDGE_PROMPT

 class Testeval:
     @pytest.mark.asyncio
-    async def test_eval_tasks_list(self, eval_stack):
+    async def test_benchmarks_list(self, eval_stack):
         # NOTE: this needs you to ensure that you are starting from a clean state
         # but so far we don't have an unregister API unfortunately, so be careful
-        eval_tasks_impl = eval_stack[Api.eval_tasks]
+        benchmarks_impl = eval_stack[Api.benchmarks]
-        response = await eval_tasks_impl.list_eval_tasks()
+        response = await benchmarks_impl.list_benchmarks()
         assert isinstance(response, list)

     @pytest.mark.asyncio
     async def test_eval_evaluate_rows(self, eval_stack, inference_model, judge_model):
-        eval_impl, eval_tasks_impl, datasetio_impl, datasets_impl, models_impl = (
+        eval_impl, benchmarks_impl, datasetio_impl, datasets_impl, models_impl = (
             eval_stack[Api.eval],
-            eval_stack[Api.eval_tasks],
+            eval_stack[Api.benchmarks],
             eval_stack[Api.datasetio],
             eval_stack[Api.datasets],
             eval_stack[Api.models],

@@ -59,17 +59,17 @@ class Testeval:
         scoring_functions = [
             "basic::equality",
         ]
-        task_id = "meta-reference::app_eval"
+        benchmark_id = "meta-reference::app_eval"
-        await eval_tasks_impl.register_eval_task(
+        await benchmarks_impl.register_benchmark(
-            eval_task_id=task_id,
+            benchmark_id=benchmark_id,
             dataset_id="test_dataset_for_eval",
             scoring_functions=scoring_functions,
         )
         response = await eval_impl.evaluate_rows(
-            task_id=task_id,
+            benchmark_id=benchmark_id,
             input_rows=rows.rows,
             scoring_functions=scoring_functions,
-            task_config=AppEvalTaskConfig(
+            task_config=AppBenchmarkConfig(
                 eval_candidate=ModelCandidate(
                     model=inference_model,
                     sampling_params=SamplingParams(),

@@ -92,9 +92,9 @@ class Testeval:

     @pytest.mark.asyncio
     async def test_eval_run_eval(self, eval_stack, inference_model, judge_model):
-        eval_impl, eval_tasks_impl, datasets_impl, models_impl = (
+        eval_impl, benchmarks_impl, datasets_impl, models_impl = (
             eval_stack[Api.eval],
-            eval_stack[Api.eval_tasks],
+            eval_stack[Api.benchmarks],
             eval_stack[Api.datasets],
             eval_stack[Api.models],
         )

@@ -105,15 +105,15 @@ class Testeval:
             "basic::subset_of",
         ]

-        task_id = "meta-reference::app_eval-2"
+        benchmark_id = "meta-reference::app_eval-2"
-        await eval_tasks_impl.register_eval_task(
+        await benchmarks_impl.register_benchmark(
-            eval_task_id=task_id,
+            benchmark_id=benchmark_id,
             dataset_id="test_dataset_for_eval",
             scoring_functions=scoring_functions,
         )
         response = await eval_impl.run_eval(
-            task_id=task_id,
+            benchmark_id=benchmark_id,
-            task_config=AppEvalTaskConfig(
+            task_config=AppBenchmarkConfig(
                 eval_candidate=ModelCandidate(
                     model=inference_model,
                     sampling_params=SamplingParams(),

@@ -121,9 +121,9 @@ class Testeval:
             ),
         )
         assert response.job_id == "0"
-        job_status = await eval_impl.job_status(task_id, response.job_id)
+        job_status = await eval_impl.job_status(benchmark_id, response.job_id)
         assert job_status and job_status.value == "completed"
-        eval_response = await eval_impl.job_result(task_id, response.job_id)
+        eval_response = await eval_impl.job_result(benchmark_id, response.job_id)

         assert eval_response is not None
         assert len(eval_response.generations) == 5

@@ -131,9 +131,9 @@ class Testeval:

     @pytest.mark.asyncio
     async def test_eval_run_benchmark_eval(self, eval_stack, inference_model):
-        eval_impl, eval_tasks_impl, datasets_impl, models_impl = (
+        eval_impl, benchmarks_impl, datasets_impl, models_impl = (
             eval_stack[Api.eval],
-            eval_stack[Api.eval_tasks],
+            eval_stack[Api.benchmarks],
             eval_stack[Api.datasets],
             eval_stack[Api.models],
         )

@@ -159,20 +159,20 @@ class Testeval:
         )

         # register eval task
-        await eval_tasks_impl.register_eval_task(
+        await benchmarks_impl.register_benchmark(
-            eval_task_id="meta-reference-mmlu",
+            benchmark_id="meta-reference-mmlu",
             dataset_id="mmlu",
             scoring_functions=["basic::regex_parser_multiple_choice_answer"],
         )

         # list benchmarks
-        response = await eval_tasks_impl.list_eval_tasks()
+        response = await benchmarks_impl.list_benchmarks()
         assert len(response) > 0

         benchmark_id = "meta-reference-mmlu"
         response = await eval_impl.run_eval(
-            task_id=benchmark_id,
+            benchmark_id=benchmark_id,
-            task_config=BenchmarkEvalTaskConfig(
+            task_config=BenchmarkBenchmarkConfig(
                 eval_candidate=ModelCandidate(
                     model=inference_model,
                     sampling_params=SamplingParams(),

@@ -10,8 +10,8 @@ from typing import Any, Dict, List, Optional

 from pydantic import BaseModel

+from llama_stack.apis.benchmarks import BenchmarkInput
 from llama_stack.apis.datasets import DatasetInput
-from llama_stack.apis.eval_tasks import EvalTaskInput
 from llama_stack.apis.models import ModelInput
 from llama_stack.apis.scoring_functions import ScoringFnInput
 from llama_stack.apis.shields import ShieldInput

@@ -42,7 +42,7 @@ async def construct_stack_for_test(
     vector_dbs: Optional[List[VectorDBInput]] = None,
     datasets: Optional[List[DatasetInput]] = None,
     scoring_fns: Optional[List[ScoringFnInput]] = None,
-    eval_tasks: Optional[List[EvalTaskInput]] = None,
+    benchmarks: Optional[List[BenchmarkInput]] = None,
     tool_groups: Optional[List[ToolGroupInput]] = None,
 ) -> TestStack:
     sqlite_file = tempfile.NamedTemporaryFile(delete=False, suffix=".db")

@@ -56,7 +56,7 @@ async def construct_stack_for_test(
         vector_dbs=vector_dbs or [],
         datasets=datasets or [],
         scoring_fns=scoring_fns or [],
-        eval_tasks=eval_tasks or [],
+        benchmarks=benchmarks or [],
         tool_groups=tool_groups or [],
     )
     run_config = parse_and_maybe_upgrade_config(run_config)

@@ -107,7 +107,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -109,7 +109,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -108,7 +108,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: brave-search

@@ -99,7 +99,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: brave-search

@@ -85,4 +85,4 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []

@@ -164,7 +164,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -153,7 +153,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -116,7 +116,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -106,7 +106,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -116,7 +116,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -106,7 +106,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -118,7 +118,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -107,7 +107,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -109,7 +109,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -139,7 +139,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -113,7 +113,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -110,7 +110,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -118,7 +118,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -107,7 +107,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -118,7 +118,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -106,7 +106,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -105,7 +105,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -159,7 +159,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -148,7 +148,7 @@ shields:
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

@@ -109,7 +109,7 @@ shields: []
 vector_dbs: []
 datasets: []
 scoring_fns: []
-eval_tasks: []
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search