chore!: deprecate eval/tasks (#1186)

# What does this PR do?
- Fully deprecate `eval/tasks`; the deprecated endpoints are removed in favor of the `benchmarks` API

Closes #1088 

NOTE: this is a breaking change. The new API was introduced in 0.1.3.
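
For reviewers, here is a rough old-to-new route mapping, pieced together from the removed `@webmethod` decorators and the surviving `/eval/benchmarks` routes in this diff; treat it as illustrative rather than authoritative, since only some of the new routes appear verbatim below.

```python
# Rough old -> new route mapping (reconstructed from the removed @webmethod
# decorators in this diff and the /eval/benchmarks routes that remain).
DEPRECATED_TO_NEW_ROUTES = {
    # Benchmarks (formerly "eval tasks")
    "GET /eval-tasks": "GET /eval/benchmarks",
    "GET /eval-tasks/{eval_task_id}": "GET /eval/benchmarks/{benchmark_id}",
    "POST /eval-tasks": "POST /eval/benchmarks",
    # Eval jobs / evaluations
    "POST /eval/tasks/{task_id}/jobs": "POST /eval/benchmarks/{benchmark_id}/jobs",
    "POST /eval/tasks/{task_id}/evaluations": "POST /eval/benchmarks/{benchmark_id}/evaluations",
    "GET /eval/tasks/{task_id}/jobs/{job_id}": "GET /eval/benchmarks/{benchmark_id}/jobs/{job_id}",
    "DELETE /eval/tasks/{task_id}/jobs/{job_id}": "DELETE /eval/benchmarks/{benchmark_id}/jobs/{job_id}",
    "GET /eval/tasks/{task_id}/jobs/{job_id}/result": "GET /eval/benchmarks/{benchmark_id}/jobs/{job_id}/result",
}
```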

The notebook has been updated to use the new endpoints.
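
To illustrate the rename on the client side, here is a minimal before/after sketch. The method names and keyword arguments come from the notebook diff below; the client setup, dataset id, rows, and `task_config` contents are assumptions for illustration only.

```python
from llama_stack_client import LlamaStackClient  # assumes the Python client package

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local server URL

# Before (removed by this PR):
#   client.eval_tasks.register(eval_task_id="meta-reference::mmmu", ...)
#   client.eval.evaluate_rows(task_id="meta-reference::mmmu", ...)

# After (benchmarks API, available since 0.1.3):
client.benchmarks.register(
    benchmark_id="meta-reference::mmmu",
    dataset_id="mmmu-Art-dev",  # hypothetical dataset id; the notebook builds it per subset/split
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)

eval_rows = [...]  # rows pulled from the registered dataset, as in the notebook
task_config = {
    # assumed BenchmarkConfig payload; note the `type: "benchmark"` discriminator is dropped by this PR
    "eval_candidate": {"type": "model", "model": "meta-llama/Llama-3.2-3B-Instruct", "sampling_params": {}},
}

response = client.eval.evaluate_rows_alpha(  # was: client.eval.evaluate_rows
    benchmark_id="meta-reference::mmmu",     # was: task_id=
    input_rows=eval_rows,
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
    task_config=task_config,
)
```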

## Test Plan
```
pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb 
```
<img width="611" alt="image"
src="https://github.com/user-attachments/assets/79f6efe1-81ba-494e-bf36-1fc0c2b9bc6f"
/>



cc @SLR722 for awareness

Xi Yan 2025-02-20 14:06:21 -08:00 committed by GitHub
parent 07ccf908f7
commit ea1faae50e
8 changed files with 1358 additions and 2161 deletions

Two file diffs suppressed because they are too large.


@@ -1017,14 +1017,14 @@
 " \"content\": SYSTEM_PROMPT_TEMPLATE.format(subject=subset),\n",
 "}\n",
 "\n",
-"client.eval_tasks.register(\n",
-" eval_task_id=\"meta-reference::mmmu\",\n",
+"client.benchmarks.register(\n",
+" benchmark_id=\"meta-reference::mmmu\",\n",
 " dataset_id=f\"mmmu-{subset}-{split}\",\n",
 " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
 ")\n",
 "\n",
-"response = client.eval.evaluate_rows(\n",
-" task_id=\"meta-reference::mmmu\",\n",
+"response = client.eval.evaluate_rows_alpha(\n",
+" benchmark_id=\"meta-reference::mmmu\",\n",
 " input_rows=eval_rows,\n",
 " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
 " task_config={\n",
@@ -1196,14 +1196,14 @@
 " provider_id=\"together\",\n",
 ")\n",
 "\n",
-"client.eval_tasks.register(\n",
-" eval_task_id=\"meta-reference::simpleqa\",\n",
+"client.benchmarks.register(\n",
+" benchmark_id=\"meta-reference::simpleqa\",\n",
 " dataset_id=simpleqa_dataset_id,\n",
 " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 ")\n",
 "\n",
-"response = client.eval.evaluate_rows(\n",
-" task_id=\"meta-reference::simpleqa\",\n",
+"response = client.eval.evaluate_rows_alpha(\n",
+" benchmark_id=\"meta-reference::simpleqa\",\n",
 " input_rows=eval_rows.rows,\n",
 " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 " task_config={\n",
@@ -1351,8 +1351,8 @@
 " \"enable_session_persistence\": False,\n",
 "}\n",
 "\n",
-"response = client.eval.evaluate_rows(\n",
-" task_id=\"meta-reference::simpleqa\",\n",
+"response = client.eval.evaluate_rows_alpha(\n",
+" benchmark_id=\"meta-reference::simpleqa\",\n",
 " input_rows=eval_rows.rows,\n",
 " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 " task_config={\n",


@@ -64,23 +64,3 @@ class Benchmarks(Protocol):
         provider_id: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
     ) -> None: ...
-
-    @webmethod(route="/eval-tasks", method="GET")
-    async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ...
-
-    @webmethod(route="/eval-tasks/{eval_task_id}", method="GET")
-    async def DEPRECATED_get_eval_task(
-        self,
-        eval_task_id: str,
-    ) -> Optional[Benchmark]: ...
-
-    @webmethod(route="/eval-tasks", method="POST")
-    async def DEPRECATED_register_eval_task(
-        self,
-        eval_task_id: str,
-        dataset_id: str,
-        scoring_functions: List[str],
-        provider_benchmark_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-    ) -> None: ...


@@ -39,7 +39,6 @@ EvalCandidate = register_schema(
 @json_schema_type
 class BenchmarkConfig(BaseModel):
-    type: Literal["benchmark"] = "benchmark"
     eval_candidate: EvalCandidate
     scoring_params: Dict[str, ScoringFnParams] = Field(
         description="Map between scoring function id and parameters for each scoring function you want to run",
@@ -84,28 +83,3 @@ class Eval(Protocol):
     @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
     async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ...
-
-    @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
-    async def DEPRECATED_run_eval(
-        self,
-        task_id: str,
-        task_config: BenchmarkConfig,
-    ) -> Job: ...
-
-    @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
-    async def DEPRECATED_evaluate_rows(
-        self,
-        task_id: str,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
-        task_config: BenchmarkConfig,
-    ) -> EvaluateResponse: ...
-
-    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
-    async def DEPRECATED_job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
-
-    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
-    async def DEPRECATED_job_cancel(self, task_id: str, job_id: str) -> None: ...
-
-    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
-    async def DEPRECATED_job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ...


@@ -411,48 +411,6 @@ class EvalRouter(Eval):
             job_id,
         )
-
-    async def DEPRECATED_run_eval(
-        self,
-        task_id: str,
-        task_config: BenchmarkConfig,
-    ) -> Job:
-        return await self.run_eval(benchmark_id=task_id, task_config=task_config)
-
-    async def DEPRECATED_evaluate_rows(
-        self,
-        task_id: str,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
-        task_config: BenchmarkConfig,
-    ) -> EvaluateResponse:
-        return await self.evaluate_rows(
-            benchmark_id=task_id,
-            input_rows=input_rows,
-            scoring_functions=scoring_functions,
-            task_config=task_config,
-        )
-
-    async def DEPRECATED_job_status(
-        self,
-        task_id: str,
-        job_id: str,
-    ) -> Optional[JobStatus]:
-        return await self.job_status(benchmark_id=task_id, job_id=job_id)
-
-    async def DEPRECATED_job_cancel(
-        self,
-        task_id: str,
-        job_id: str,
-    ) -> None:
-        return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
-
-    async def DEPRECATED_job_result(
-        self,
-        task_id: str,
-        job_id: str,
-    ) -> EvaluateResponse:
-        return await self.job_result(benchmark_id=task_id, job_id=job_id)


 class ToolRuntimeRouter(ToolRuntime):
     class RagToolImpl(RAGToolRuntime):


@@ -468,35 +468,6 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
         )
         await self.register_object(benchmark)
-
-    async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse:
-        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
-        return await self.list_benchmarks()
-
-    async def DEPRECATED_get_eval_task(
-        self,
-        eval_task_id: str,
-    ) -> Optional[Benchmark]:
-        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
-        return await self.get_benchmark(eval_task_id)
-
-    async def DEPRECATED_register_eval_task(
-        self,
-        eval_task_id: str,
-        dataset_id: str,
-        scoring_functions: List[str],
-        provider_benchmark_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
-        return await self.register_benchmark(
-            benchmark_id=eval_task_id,
-            dataset_id=dataset_id,
-            scoring_functions=scoring_functions,
-            metadata=metadata,
-            provider_benchmark_id=provider_benchmark_id,
-        )


 class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
     async def list_tools(self, toolgroup_id: Optional[str] = None) -> ListToolsResponse:


@@ -234,45 +234,3 @@ class MetaReferenceEvalImpl(
             raise ValueError(f"Job is not completed, Status: {status.value}")
         return self.jobs[job_id]
-
-    async def DEPRECATED_run_eval(
-        self,
-        task_id: str,
-        task_config: BenchmarkConfig,
-    ) -> Job:
-        return await self.run_eval(benchmark_id=task_id, task_config=task_config)
-
-    async def DEPRECATED_evaluate_rows(
-        self,
-        task_id: str,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
-        task_config: BenchmarkConfig,
-    ) -> EvaluateResponse:
-        return await self.evaluate_rows(
-            benchmark_id=task_id,
-            input_rows=input_rows,
-            scoring_functions=scoring_functions,
-            task_config=task_config,
-        )
-
-    async def DEPRECATED_job_status(
-        self,
-        task_id: str,
-        job_id: str,
-    ) -> Optional[JobStatus]:
-        return await self.job_status(benchmark_id=task_id, job_id=job_id)
-
-    async def DEPRECATED_job_cancel(
-        self,
-        task_id: str,
-        job_id: str,
-    ) -> None:
-        return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
-
-    async def DEPRECATED_job_result(
-        self,
-        task_id: str,
-        job_id: str,
-    ) -> EvaluateResponse:
-        return await self.job_result(benchmark_id=task_id, job_id=job_id)