Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-28 02:53:30 +00:00)
chore!: deprecate eval/tasks (#1186)
# What does this PR do?

- Fully deprecate eval/tasks

Closes #1088

NOTE: this is a breaking change. The new API was introduced in 0.1.3, and the notebook has been updated to use the new endpoints.

## Test Plan

```
pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
```

<img width="611" alt="image" src="https://github.com/user-attachments/assets/79f6efe1-81ba-494e-bf36-1fc0c2b9bc6f" />

cc @SLR722 for awareness
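The migration is mechanical: `client.eval_tasks` becomes `client.benchmarks`, `eval_task_id`/`task_id` become `benchmark_id`, and `evaluate_rows` becomes `evaluate_rows_alpha`. A minimal before/after sketch with the Python client (the base URL and dataset id are placeholders, not taken from this diff):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local server

# Before (removed by this PR):
#   client.eval_tasks.register(eval_task_id="meta-reference::mmmu", ...)
#   client.eval.evaluate_rows(task_id="meta-reference::mmmu", ...)

# After:
client.benchmarks.register(
    benchmark_id="meta-reference::mmmu",
    dataset_id="mmmu-Art-validation",  # placeholder; register the dataset first
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
```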
parent 07ccf908f7
commit ea1faae50e

8 changed files with 1358 additions and 2161 deletions
docs/_static/llama-stack-spec.html (vendored): 2021 changes. File diff suppressed because it is too large.
docs/_static/llama-stack-spec.yaml (vendored): 1319 changes. File diff suppressed because it is too large.
docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb

```diff
@@ -1017,14 +1017,14 @@
 " \"content\": SYSTEM_PROMPT_TEMPLATE.format(subject=subset),\n",
 "}\n",
 "\n",
-"client.eval_tasks.register(\n",
-" eval_task_id=\"meta-reference::mmmu\",\n",
+"client.benchmarks.register(\n",
+" benchmark_id=\"meta-reference::mmmu\",\n",
 " dataset_id=f\"mmmu-{subset}-{split}\",\n",
 " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
 ")\n",
 "\n",
-"response = client.eval.evaluate_rows(\n",
-" task_id=\"meta-reference::mmmu\",\n",
+"response = client.eval.evaluate_rows_alpha(\n",
+" benchmark_id=\"meta-reference::mmmu\",\n",
 " input_rows=eval_rows,\n",
 " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
 " task_config={\n",
@@ -1196,14 +1196,14 @@
 " provider_id=\"together\",\n",
 ")\n",
 "\n",
-"client.eval_tasks.register(\n",
-" eval_task_id=\"meta-reference::simpleqa\",\n",
+"client.benchmarks.register(\n",
+" benchmark_id=\"meta-reference::simpleqa\",\n",
 " dataset_id=simpleqa_dataset_id,\n",
 " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 ")\n",
 "\n",
-"response = client.eval.evaluate_rows(\n",
-" task_id=\"meta-reference::simpleqa\",\n",
+"response = client.eval.evaluate_rows_alpha(\n",
+" benchmark_id=\"meta-reference::simpleqa\",\n",
 " input_rows=eval_rows.rows,\n",
 " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 " task_config={\n",
@@ -1351,8 +1351,8 @@
 " \"enable_session_persistence\": False,\n",
 "}\n",
 "\n",
-"response = client.eval.evaluate_rows(\n",
-" task_id=\"meta-reference::simpleqa\",\n",
+"response = client.eval.evaluate_rows_alpha(\n",
+" benchmark_id=\"meta-reference::simpleqa\",\n",
 " input_rows=eval_rows.rows,\n",
 " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 " task_config={\n",
```
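All three notebook hunks truncate at `task_config={`; for orientation, here is a sketch of the shape that argument takes. The candidate fields below (model id, sampling strategy) are assumptions, not shown in this diff:

```python
# Sketch only: the hunks above cut off inside task_config; the model id and
# sampling strategy here are assumed placeholders.
response = client.eval.evaluate_rows_alpha(
    benchmark_id="meta-reference::simpleqa",
    input_rows=eval_rows.rows,
    scoring_functions=["llm-as-judge::405b-simpleqa"],
    task_config={
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.1-405B-Instruct",  # assumed
            "sampling_params": {"strategy": {"type": "greedy"}},
        },
    },
)
```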
llama_stack/apis/benchmarks/benchmarks.py

```diff
@@ -64,23 +64,3 @@ class Benchmarks(Protocol):
         provider_id: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
     ) -> None: ...
-
-    @webmethod(route="/eval-tasks", method="GET")
-    async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse: ...
-
-    @webmethod(route="/eval-tasks/{eval_task_id}", method="GET")
-    async def DEPRECATED_get_eval_task(
-        self,
-        eval_task_id: str,
-    ) -> Optional[Benchmark]: ...
-
-    @webmethod(route="/eval-tasks", method="POST")
-    async def DEPRECATED_register_eval_task(
-        self,
-        eval_task_id: str,
-        dataset_id: str,
-        scoring_functions: List[str],
-        provider_benchmark_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-    ) -> None: ...
```
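With the `/eval-tasks` shims gone, only the benchmarks surface remains on this protocol. A reconstruction of the surviving methods, inferred from the context lines above and the routing-table hunk further down (a sketch, not the verbatim file):

```python
class Benchmarks(Protocol):
    @webmethod(route="/eval/benchmarks", method="GET")
    async def list_benchmarks(self) -> ListBenchmarksResponse: ...

    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET")
    async def get_benchmark(self, benchmark_id: str) -> Optional[Benchmark]: ...

    @webmethod(route="/eval/benchmarks", method="POST")
    async def register_benchmark(
        self,
        benchmark_id: str,
        dataset_id: str,
        scoring_functions: List[str],
        provider_benchmark_id: Optional[str] = None,
        provider_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None: ...
```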
llama_stack/apis/eval/eval.py

```diff
@@ -39,7 +39,6 @@ EvalCandidate = register_schema(
 
 @json_schema_type
 class BenchmarkConfig(BaseModel):
-    type: Literal["benchmark"] = "benchmark"
     eval_candidate: EvalCandidate
     scoring_params: Dict[str, ScoringFnParams] = Field(
         description="Map between scoring function id and parameters for each scoring function you want to run",
@@ -84,28 +83,3 @@ class Eval(Protocol):
 
     @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
     async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ...
-
-    @webmethod(route="/eval/tasks/{task_id}/jobs", method="POST")
-    async def DEPRECATED_run_eval(
-        self,
-        task_id: str,
-        task_config: BenchmarkConfig,
-    ) -> Job: ...
-
-    @webmethod(route="/eval/tasks/{task_id}/evaluations", method="POST")
-    async def DEPRECATED_evaluate_rows(
-        self,
-        task_id: str,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
-        task_config: BenchmarkConfig,
-    ) -> EvaluateResponse: ...
-
-    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="GET")
-    async def DEPRECATED_job_status(self, task_id: str, job_id: str) -> Optional[JobStatus]: ...
-
-    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}", method="DELETE")
-    async def DEPRECATED_job_cancel(self, task_id: str, job_id: str) -> None: ...
-
-    @webmethod(route="/eval/tasks/{task_id}/jobs/{job_id}/result", method="GET")
-    async def DEPRECATED_job_result(self, task_id: str, job_id: str) -> EvaluateResponse: ...
```
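Two things happen in this file: the `type: Literal["benchmark"]` discriminator is dropped from `BenchmarkConfig` (presumably because only one config type is left after the consolidation), and all five `/eval/tasks/...` routes disappear. Constructing a config afterward looks roughly like this (the import paths and model id are assumptions, not taken from this diff):

```python
# Sketch, assuming the post-PR shape of BenchmarkConfig: no "type" field,
# just the candidate plus per-function scoring params.
from llama_stack.apis.eval import BenchmarkConfig, ModelCandidate  # assumed import path
from llama_stack.apis.inference import SamplingParams              # assumed import path

config = BenchmarkConfig(
    eval_candidate=ModelCandidate(
        type="model",
        model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
        sampling_params=SamplingParams(),
    ),
    scoring_params={},
)
```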
llama_stack/distribution/routers/routers.py

```diff
@@ -411,48 +411,6 @@ class EvalRouter(Eval):
             job_id,
         )
-
-    async def DEPRECATED_run_eval(
-        self,
-        task_id: str,
-        task_config: BenchmarkConfig,
-    ) -> Job:
-        return await self.run_eval(benchmark_id=task_id, task_config=task_config)
-
-    async def DEPRECATED_evaluate_rows(
-        self,
-        task_id: str,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
-        task_config: BenchmarkConfig,
-    ) -> EvaluateResponse:
-        return await self.evaluate_rows(
-            benchmark_id=task_id,
-            input_rows=input_rows,
-            scoring_functions=scoring_functions,
-            task_config=task_config,
-        )
-
-    async def DEPRECATED_job_status(
-        self,
-        task_id: str,
-        job_id: str,
-    ) -> Optional[JobStatus]:
-        return await self.job_status(benchmark_id=task_id, job_id=job_id)
-
-    async def DEPRECATED_job_cancel(
-        self,
-        task_id: str,
-        job_id: str,
-    ) -> None:
-        return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
-
-    async def DEPRECATED_job_result(
-        self,
-        task_id: str,
-        job_id: str,
-    ) -> EvaluateResponse:
-        return await self.job_result(benchmark_id=task_id, job_id=job_id)
 
 
 class ToolRuntimeRouter(ToolRuntime):
     class RagToolImpl(RAGToolRuntime):
```
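The removed router methods were pure forwarders that renamed `task_id` to `benchmark_id` before delegating. Downstream code that still uses the old names can reproduce that mapping on the caller's side; a hypothetical helper (not part of llama-stack):

```python
# Hypothetical compatibility helper, not part of llama-stack: reproduces the
# deleted forwarding behavior (task_id -> benchmark_id) on the client side.
def evaluate_rows_compat(client, task_id, input_rows, scoring_functions, task_config):
    return client.eval.evaluate_rows_alpha(
        benchmark_id=task_id,
        input_rows=input_rows,
        scoring_functions=scoring_functions,
        task_config=task_config,
    )
```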
llama_stack/distribution/routers/routing_tables.py

```diff
@@ -468,35 +468,6 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
         )
         await self.register_object(benchmark)
-
-    async def DEPRECATED_list_eval_tasks(self) -> ListBenchmarksResponse:
-        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
-        return await self.list_benchmarks()
-
-    async def DEPRECATED_get_eval_task(
-        self,
-        eval_task_id: str,
-    ) -> Optional[Benchmark]:
-        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
-        return await self.get_benchmark(eval_task_id)
-
-    async def DEPRECATED_register_eval_task(
-        self,
-        eval_task_id: str,
-        dataset_id: str,
-        scoring_functions: List[str],
-        provider_benchmark_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        logger.warning("DEPRECATED: Use /eval/benchmarks instead")
-        return await self.register_benchmark(
-            benchmark_id=eval_task_id,
-            dataset_id=dataset_id,
-            scoring_functions=scoring_functions,
-            metadata=metadata,
-            provider_benchmark_id=provider_benchmark_id,
-        )
 
 
 class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
     async def list_tools(self, toolgroup_id: Optional[str] = None) -> ListToolsResponse:
```
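A quick way to confirm the old surface is really gone after upgrading (a hypothetical check, not part of this PR's test plan; the import path is assumed from the repo layout):

```python
# Hypothetical regression check: the deprecated shims should no longer exist.
from llama_stack.distribution.routers.routing_tables import BenchmarksRoutingTable

assert not hasattr(BenchmarksRoutingTable, "DEPRECATED_list_eval_tasks")
assert not hasattr(BenchmarksRoutingTable, "DEPRECATED_register_eval_task")
```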
llama_stack/providers/inline/eval/meta_reference/eval.py

```diff
@@ -234,45 +234,3 @@ class MetaReferenceEvalImpl(
             raise ValueError(f"Job is not completed, Status: {status.value}")
 
         return self.jobs[job_id]
-
-    async def DEPRECATED_run_eval(
-        self,
-        task_id: str,
-        task_config: BenchmarkConfig,
-    ) -> Job:
-        return await self.run_eval(benchmark_id=task_id, task_config=task_config)
-
-    async def DEPRECATED_evaluate_rows(
-        self,
-        task_id: str,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
-        task_config: BenchmarkConfig,
-    ) -> EvaluateResponse:
-        return await self.evaluate_rows(
-            benchmark_id=task_id,
-            input_rows=input_rows,
-            scoring_functions=scoring_functions,
-            task_config=task_config,
-        )
-
-    async def DEPRECATED_job_status(
-        self,
-        task_id: str,
-        job_id: str,
-    ) -> Optional[JobStatus]:
-        return await self.job_status(benchmark_id=task_id, job_id=job_id)
-
-    async def DEPRECATED_job_cancel(
-        self,
-        task_id: str,
-        job_id: str,
-    ) -> None:
-        return await self.job_cancel(benchmark_id=task_id, job_id=job_id)
-
-    async def DEPRECATED_job_result(
-        self,
-        task_id: str,
-        job_id: str,
-    ) -> EvaluateResponse:
-        return await self.job_result(benchmark_id=task_id, job_id=job_id)
```