mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-06-28 02:53:30 +00:00
chore!: deprecate eval/tasks (#1186)
# What does this PR do? - Fully deprecate eval/tasks [//]: # (If resolving an issue, uncomment and update the line below) Closes #1088 NOTE: this will be a breaking change. We have introduced the new API in 0.1.3 . Notebook has been updated to use the new endpoints. ## Test Plan ``` pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb ``` <img width="611" alt="image" src="https://github.com/user-attachments/assets/79f6efe1-81ba-494e-bf36-1fc0c2b9bc6f" /> cc @SLR722 for awareness [//]: # (## Documentation)
This commit is contained in:
parent
07ccf908f7
commit
ea1faae50e
8 changed files with 1358 additions and 2161 deletions
1969
docs/_static/llama-stack-spec.html
vendored
1969
docs/_static/llama-stack-spec.html
vendored
File diff suppressed because it is too large
Load diff
1303
docs/_static/llama-stack-spec.yaml
vendored
1303
docs/_static/llama-stack-spec.yaml
vendored
File diff suppressed because it is too large
Load diff
|
@ -1017,14 +1017,14 @@
|
|||
" \"content\": SYSTEM_PROMPT_TEMPLATE.format(subject=subset),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"client.eval_tasks.register(\n",
|
||||
" eval_task_id=\"meta-reference::mmmu\",\n",
|
||||
"client.benchmarks.register(\n",
|
||||
" benchmark_id=\"meta-reference::mmmu\",\n",
|
||||
" dataset_id=f\"mmmu-{subset}-{split}\",\n",
|
||||
" scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"response = client.eval.evaluate_rows(\n",
|
||||
" task_id=\"meta-reference::mmmu\",\n",
|
||||
"response = client.eval.evaluate_rows_alpha(\n",
|
||||
" benchmark_id=\"meta-reference::mmmu\",\n",
|
||||
" input_rows=eval_rows,\n",
|
||||
" scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
|
||||
" task_config={\n",
|
||||
|
@ -1196,14 +1196,14 @@
|
|||
" provider_id=\"together\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"client.eval_tasks.register(\n",
|
||||
" eval_task_id=\"meta-reference::simpleqa\",\n",
|
||||
"client.benchmarks.register(\n",
|
||||
" benchmark_id=\"meta-reference::simpleqa\",\n",
|
||||
" dataset_id=simpleqa_dataset_id,\n",
|
||||
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"response = client.eval.evaluate_rows(\n",
|
||||
" task_id=\"meta-reference::simpleqa\",\n",
|
||||
"response = client.eval.evaluate_rows_alpha(\n",
|
||||
" benchmark_id=\"meta-reference::simpleqa\",\n",
|
||||
" input_rows=eval_rows.rows,\n",
|
||||
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
|
||||
" task_config={\n",
|
||||
|
@ -1351,8 +1351,8 @@
|
|||
" \"enable_session_persistence\": False,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"response = client.eval.evaluate_rows(\n",
|
||||
" task_id=\"meta-reference::simpleqa\",\n",
|
||||
"response = client.eval.evaluate_rows_alpha(\n",
|
||||
" benchmark_id=\"meta-reference::simpleqa\",\n",
|
||||
" input_rows=eval_rows.rows,\n",
|
||||
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
|
||||
" task_config={\n",
|
||||
|
|
|
@ -64,23 +64,3 @@ class Benchmarks(Protocol):
|
|||
provider_id: Optional[str] = None,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
) -> None: ...
|
||||
|
||||
@webmethod(route="/eval-tasks", method="GET")
|
||||
async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ...
|
||||
|
||||
@webmethod(route="/eval-tasks/{eval_task_id}", method="GET")
|
||||
async def DEPRECATED_get_eval_task(
|
||||
self,
|
||||
eval_task_id: str,
|
||||
) -> Optional[Benchmark]: ...
|
||||
|
||||
@webmethod(route="/eval-tasks", method="POST")
|
||||
async def DEPRECATED_register_eval_task(
|
||||
self,
|
||||
eval_task_id: str,
|
||||
dataset_id: str,
|
||||
scoring_functions: List[str],
|
||||
provider_benchmark_id: Optional[str] = None,
|
||||
provider_id: Optional[str] = None,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
) -> None: ...
|
||||
|
|
|
@ -39,7 +39,6 @@ EvalCandidate = register_schema(
|
|||
|
||||
@json_schema_type
|
||||
class BenchmarkConfig(BaseModel):
|
||||
type: Literal["benchmark"] = "benchmark"
|
||||
eval_candidate: EvalCandidate
|
||||
scoring_params: Dict[str, ScoringFnParams] = Field(
|
||||
description="Map between scoring function id and parameters for each scoring function you want to run",
|
||||
|
@ -84,28 +83,3 @@ class Eval(Protocol):
|
|||
|
||||
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
|
||||
async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ...
|
||||
|
||||
@webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
|
||||
async def DEPRECATED_run_eval(
|
||||
self,
|
||||
task_id: str,
|
||||
task_config: BenchmarkConfig,
|
||||
) -> Job: ...
|
||||
|
||||
@webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
|
||||
async def DEPRECATED_evaluate_rows(
|
||||
self,
|
||||
task_id: str,
|
||||
input_rows: List[Dict[str, Any]],
|
||||
scoring_functions: List[str],
|
||||
task_config: BenchmarkConfig,
|
||||
) -> EvaluateResponse: ...
|
||||
|
||||
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
|
||||
async def DEPRECATED_job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
|
||||
|
||||
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
|
||||
async def DEPRECATED_job_cancel(self, task_id: str, job_id: str) -> None: ...
|
||||
|
||||
@webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
|
||||
async def DEPRECATED_job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ...
|
||||
|
|
|
@ -411,48 +411,6 @@ class EvalRouter(Eval):
|
|||
job_id,
|
||||
)
|
||||
|
||||
async def DEPRECATED_run_eval(
|
||||
self,
|
||||
task_id: str,
|
||||
task_config: BenchmarkConfig,
|
||||
) -> Job:
|
||||
return await self.run_eval(benchmark_id=task_id, task_config=task_config)
|
||||
|
||||
async def DEPRECATED_evaluate_rows(
|
||||
self,
|
||||
task_id: str,
|
||||
input_rows: List[Dict[str, Any]],
|
||||
scoring_functions: List[str],
|
||||
task_config: BenchmarkConfig,
|
||||
) -> EvaluateResponse:
|
||||
return await self.evaluate_rows(
|
||||
benchmark_id=task_id,
|
||||
input_rows=input_rows,
|
||||
scoring_functions=scoring_functions,
|
||||
task_config=task_config,
|
||||
)
|
||||
|
||||
async def DEPRECATED_job_status(
|
||||
self,
|
||||
task_id: str,
|
||||
job_id: str,
|
||||
) -> Optional[JobStatus]:
|
||||
return await self.job_status(benchmark_id=task_id, job_id=job_id)
|
||||
|
||||
async def DEPRECATED_job_cancel(
|
||||
self,
|
||||
task_id: str,
|
||||
job_id: str,
|
||||
) -> None:
|
||||
return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
|
||||
|
||||
async def DEPRECATED_job_result(
|
||||
self,
|
||||
task_id: str,
|
||||
job_id: str,
|
||||
) -> EvaluateResponse:
|
||||
return await self.job_result(benchmark_id=task_id, job_id=job_id)
|
||||
|
||||
|
||||
class ToolRuntimeRouter(ToolRuntime):
|
||||
class RagToolImpl(RAGToolRuntime):
|
||||
|
|
|
@ -468,35 +468,6 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
|
|||
)
|
||||
await self.register_object(benchmark)
|
||||
|
||||
async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse:
|
||||
logger.warning("DEPRECATED: Use /eval/benchmarks instead")
|
||||
return await self.list_benchmarks()
|
||||
|
||||
async def DEPRECATED_get_eval_task(
|
||||
self,
|
||||
eval_task_id: str,
|
||||
) -> Optional[Benchmark]:
|
||||
logger.warning("DEPRECATED: Use /eval/benchmarks instead")
|
||||
return await self.get_benchmark(eval_task_id)
|
||||
|
||||
async def DEPRECATED_register_eval_task(
|
||||
self,
|
||||
eval_task_id: str,
|
||||
dataset_id: str,
|
||||
scoring_functions: List[str],
|
||||
provider_benchmark_id: Optional[str] = None,
|
||||
provider_id: Optional[str] = None,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
) -> None:
|
||||
logger.warning("DEPRECATED: Use /eval/benchmarks instead")
|
||||
return await self.register_benchmark(
|
||||
benchmark_id=eval_task_id,
|
||||
dataset_id=dataset_id,
|
||||
scoring_functions=scoring_functions,
|
||||
metadata=metadata,
|
||||
provider_benchmark_id=provider_benchmark_id,
|
||||
)
|
||||
|
||||
|
||||
class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
|
||||
async def list_tools(self, toolgroup_id: Optional[str] = None) -> ListToolsResponse:
|
||||
|
|
|
@ -234,45 +234,3 @@ class MetaReferenceEvalImpl(
|
|||
raise ValueError(f"Job is not completed, Status: {status.value}")
|
||||
|
||||
return self.jobs[job_id]
|
||||
|
||||
async def DEPRECATED_run_eval(
|
||||
self,
|
||||
task_id: str,
|
||||
task_config: BenchmarkConfig,
|
||||
) -> Job:
|
||||
return await self.run_eval(benchmark_id=task_id, task_config=task_config)
|
||||
|
||||
async def DEPRECATED_evaluate_rows(
|
||||
self,
|
||||
task_id: str,
|
||||
input_rows: List[Dict[str, Any]],
|
||||
scoring_functions: List[str],
|
||||
task_config: BenchmarkConfig,
|
||||
) -> EvaluateResponse:
|
||||
return await self.evaluate_rows(
|
||||
benchmark_id=task_id,
|
||||
input_rows=input_rows,
|
||||
scoring_functions=scoring_functions,
|
||||
task_config=task_config,
|
||||
)
|
||||
|
||||
async def DEPRECATED_job_status(
|
||||
self,
|
||||
task_id: str,
|
||||
job_id: str,
|
||||
) -> Optional[JobStatus]:
|
||||
return await self.job_status(benchmark_id=task_id, job_id=job_id)
|
||||
|
||||
async def DEPRECATED_job_cancel(
|
||||
self,
|
||||
task_id: str,
|
||||
job_id: str,
|
||||
) -> None:
|
||||
return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
|
||||
|
||||
async def DEPRECATED_job_result(
|
||||
self,
|
||||
task_id: str,
|
||||
job_id: str,
|
||||
) -> EvaluateResponse:
|
||||
return await self.job_result(benchmark_id=task_id, job_id=job_id)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue