From e9a37bad6308e22090126c982d09d1205458d0a5 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Tue, 4 Mar 2025 12:44:04 -0800
Subject: [PATCH] chore: rename task_config to benchmark_config (#1397)

# What does this PR do?

- This was missed from previous deprecation: https://github.com/meta-llama/llama-stack/pull/1186
- Part of https://github.com/meta-llama/llama-stack/issues/1396

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan

```
pytest -v -s --nbval-lax ./llama-stack/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
```

[//]: # (## Documentation)
---
 docs/_static/llama-stack-spec.html            |  8 +++---
 docs/_static/llama-stack-spec.yaml            |  8 +++---
 .../Alpha_Llama_Stack_Post_Training.ipynb     |  4 +--
 .../Llama_Stack_Benchmark_Evals.ipynb         |  6 ++---
 docs/source/building_applications/evals.md    |  6 ++---
 .../building_applications/evaluation.md       |  2 +-
 .../references/evals_reference/index.md       |  6 ++---
 llama_stack/apis/eval/eval.py                 |  4 +--
 llama_stack/distribution/routers/routers.py   | 23 +++++++++++-----
 .../ui/page/evaluations/native_eval.py        |  2 +-
 .../inline/eval/meta_reference/eval.py        | 26 +++++++++----------
 llama_stack/providers/tests/eval/test_eval.py |  6 ++---
 12 files changed, 55 insertions(+), 46 deletions(-)

diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index aeb350ce0..643e1faee 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -6355,7 +6355,7 @@
               "type": "string"
             }
           },
-          "task_config": {
+          "benchmark_config": {
            "$ref": "#/components/schemas/BenchmarkConfig"
          }
        },
@@ -6363,7 +6363,7 @@
       "required": [
         "input_rows",
         "scoring_functions",
-        "task_config"
+        "benchmark_config"
       ],
       "title": "EvaluateRowsRequest"
     },
@@ -9248,13 +9248,13 @@
     "RunEvalRequest": {
       "type": "object",
       "properties": {
-        "task_config": {
+        "benchmark_config": {
           "$ref": "#/components/schemas/BenchmarkConfig"
         }
       },
       "additionalProperties": false,
       "required": [
-        "task_config"
+        "benchmark_config"
       ],
       "title": "RunEvalRequest"
     },
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index f3410aa7d..eb31b61fb 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -4357,13 +4357,13 @@ components:
           type: array
           items:
             type: string
-        task_config:
+        benchmark_config:
           $ref: '#/components/schemas/BenchmarkConfig'
       additionalProperties: false
       required:
       - input_rows
       - scoring_functions
-      - task_config
+      - benchmark_config
       title: EvaluateRowsRequest
     EvaluateResponse:
       type: object
@@ -6168,11 +6168,11 @@ components:
     RunEvalRequest:
       type: object
      properties:
-        task_config:
+        benchmark_config:
          $ref: '#/components/schemas/BenchmarkConfig'
      additionalProperties: false
      required:
-      - task_config
+      - benchmark_config
      title: RunEvalRequest
    Job:
      type: object
diff --git a/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb b/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
index ae50b95a1..1cea5d0ef 100644
--- a/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
+++ b/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
@@ -3675,7 +3675,7 @@
     "    benchmark_id=\"llama3.2-3B-instruct:tax_eval\",\n",
     "    input_rows=eval_rows.rows,\n",
     "    scoring_functions=[\"braintrust::answer-similarity\"],\n",
-    "    task_config={\n",
+    "    benchmark_config={\n",
     "        \"type\": \"benchmark\",\n",
     "        \"eval_candidate\": {\n",
     "            \"type\": \"model\",\n",
@@ -6383,7 +6383,7 @@
     "    benchmark_id=\"Llama-3.2-3B-Instruct-sft-0:tax_eval\",\n",
     "    input_rows=eval_rows.rows,\n",
scoring_functions=[\"braintrust::answer-similarity\"],\n", - " task_config={\n", + " benchmark_config={\n", " \"type\": \"benchmark\",\n", " \"eval_candidate\": {\n", " \"type\": \"model\",\n", diff --git a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb index 174cbcce6..8f0c84294 100644 --- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb +++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb @@ -781,7 +781,7 @@ " benchmark_id=\"meta-reference::mmmu\",\n", " input_rows=eval_rows,\n", " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n", - " task_config={\n", + " benchmark_config={\n", " \"type\": \"benchmark\",\n", " \"eval_candidate\": {\n", " \"type\": \"model\",\n", @@ -960,7 +960,7 @@ " benchmark_id=\"meta-reference::simpleqa\",\n", " input_rows=eval_rows.rows,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", - " task_config={\n", + " benchmark_config={\n", " \"type\": \"benchmark\",\n", " \"eval_candidate\": {\n", " \"type\": \"model\",\n", @@ -1109,7 +1109,7 @@ " benchmark_id=\"meta-reference::simpleqa\",\n", " input_rows=eval_rows.rows,\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", - " task_config={\n", + " benchmark_config={\n", " \"type\": \"benchmark\",\n", " \"eval_candidate\": {\n", " \"type\": \"agent\",\n", diff --git a/docs/source/building_applications/evals.md b/docs/source/building_applications/evals.md index 8106c0dd5..c54536897 100644 --- a/docs/source/building_applications/evals.md +++ b/docs/source/building_applications/evals.md @@ -51,7 +51,7 @@ response = client.eval.evaluate_rows( benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], - task_config={ + benchmark_config={ "type": "benchmark", "eval_candidate": { "type": "model", @@ -109,7 +109,7 @@ response = client.eval.evaluate_rows( benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], - task_config={ + benchmark_config={ "type": "benchmark", "eval_candidate": { "type": "model", @@ -158,7 +158,7 @@ response = client.eval.evaluate_rows( benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], - task_config={ + benchmark_config={ "type": "benchmark", "eval_candidate": { "type": "agent", diff --git a/docs/source/building_applications/evaluation.md b/docs/source/building_applications/evaluation.md index ad220f751..981771862 100644 --- a/docs/source/building_applications/evaluation.md +++ b/docs/source/building_applications/evaluation.md @@ -19,7 +19,7 @@ response = client.benchmarks.register( # Run evaluation job = client.eval.run_eval( benchmark_id="my_eval", - task_config={ + benchmark_config={ "type": "app", "eval_candidate": {"type": "agent", "config": agent_config}, }, diff --git a/docs/source/references/evals_reference/index.md b/docs/source/references/evals_reference/index.md index 71dbb47e5..d4cf2e20e 100644 --- a/docs/source/references/evals_reference/index.md +++ b/docs/source/references/evals_reference/index.md @@ -87,7 +87,7 @@ response = client.eval.evaluate_rows( benchmark_id="meta-reference::mmmu", input_rows=eval_rows, scoring_functions=["basic::regex_parser_multiple_choice_answer"], - task_config={ + benchmark_config={ "type": "benchmark", "eval_candidate": { "type": "model", @@ -145,7 +145,7 @@ response = client.eval.evaluate_rows( benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, 
scoring_functions=["llm-as-judge::405b-simpleqa"], - task_config={ + benchmark_config={ "type": "benchmark", "eval_candidate": { "type": "model", @@ -195,7 +195,7 @@ response = client.eval.evaluate_rows( benchmark_id="meta-reference::simpleqa", input_rows=eval_rows.rows, scoring_functions=["llm-as-judge::405b-simpleqa"], - task_config={ + benchmark_config={ "type": "benchmark", "eval_candidate": { "type": "agent", diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py index a7b2e7670..40a3b750a 100644 --- a/llama_stack/apis/eval/eval.py +++ b/llama_stack/apis/eval/eval.py @@ -63,7 +63,7 @@ class Eval(Protocol): async def run_eval( self, benchmark_id: str, - task_config: BenchmarkConfig, + benchmark_config: BenchmarkConfig, ) -> Job: ... @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST") @@ -72,7 +72,7 @@ class Eval(Protocol): benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: BenchmarkConfig, + benchmark_config: BenchmarkConfig, ) -> EvaluateResponse: ... @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET") diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 350c3c997..691df1988 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -81,7 +81,10 @@ class VectorIORouter(VectorIO): provider_id: Optional[str] = None, provider_vector_db_id: Optional[str] = None, ) -> None: - logcat.debug("core", f"VectorIORouter.register_vector_db: {vector_db_id}, {embedding_model}") + logcat.debug( + "core", + f"VectorIORouter.register_vector_db: {vector_db_id}, {embedding_model}", + ) await self.routing_table.register_vector_db( vector_db_id, embedding_model, @@ -328,7 +331,10 @@ class DatasetIORouter(DatasetIO): page_token: Optional[str] = None, filter_condition: Optional[str] = None, ) -> PaginatedRowsResult: - logcat.debug("core", f"DatasetIORouter.get_rows_paginated: {dataset_id}, rows_in_page={rows_in_page}") + logcat.debug( + "core", + f"DatasetIORouter.get_rows_paginated: {dataset_id}, rows_in_page={rows_in_page}", + ) return await self.routing_table.get_provider_impl(dataset_id).get_rows_paginated( dataset_id=dataset_id, rows_in_page=rows_in_page, @@ -387,7 +393,10 @@ class ScoringRouter(Scoring): input_rows: List[Dict[str, Any]], scoring_functions: Dict[str, Optional[ScoringFnParams]] = None, ) -> ScoreResponse: - logcat.debug("core", f"ScoringRouter.score: {len(input_rows)} rows, {len(scoring_functions)} functions") + logcat.debug( + "core", + f"ScoringRouter.score: {len(input_rows)} rows, {len(scoring_functions)} functions", + ) res = {} # look up and map each scoring function to its provider impl for fn_identifier in scoring_functions.keys(): @@ -419,12 +428,12 @@ class EvalRouter(Eval): async def run_eval( self, benchmark_id: str, - task_config: BenchmarkConfig, + benchmark_config: BenchmarkConfig, ) -> Job: logcat.debug("core", f"EvalRouter.run_eval: {benchmark_id}") return await self.routing_table.get_provider_impl(benchmark_id).run_eval( benchmark_id=benchmark_id, - task_config=task_config, + benchmark_config=benchmark_config, ) async def evaluate_rows( @@ -432,14 +441,14 @@ class EvalRouter(Eval): benchmark_id: str, input_rows: List[Dict[str, Any]], scoring_functions: List[str], - task_config: BenchmarkConfig, + benchmark_config: BenchmarkConfig, ) -> EvaluateResponse: logcat.debug("core", f"EvalRouter.evaluate_rows: {benchmark_id}, {len(input_rows)} rows") 
         return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows(
             benchmark_id=benchmark_id,
             input_rows=input_rows,
             scoring_functions=scoring_functions,
-            task_config=task_config,
+            benchmark_config=benchmark_config,
         )

     async def job_status(
diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py
index f1cae714a..00e949ed6 100644
--- a/llama_stack/distribution/ui/page/evaluations/native_eval.py
+++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py
@@ -212,7 +212,7 @@ def run_evaluation_3():
                 benchmark_id=selected_benchmark,
                 input_rows=[r],
                 scoring_functions=benchmarks[selected_benchmark].scoring_functions,
-                task_config=benchmark_config,
+                benchmark_config=benchmark_config,
             )

             for k in r.keys():
diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py
index a01f7f1f3..a1bebaa4c 100644
--- a/llama_stack/providers/inline/eval/meta_reference/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@@ -83,7 +83,7 @@ class MetaReferenceEvalImpl(
     async def run_eval(
         self,
         benchmark_id: str,
-        task_config: BenchmarkConfig,
+        benchmark_config: BenchmarkConfig,
     ) -> Job:
         task_def = self.benchmarks[benchmark_id]
         dataset_id = task_def.dataset_id
@@ -92,13 +92,13 @@ class MetaReferenceEvalImpl(
         validate_dataset_schema(dataset_def.dataset_schema, get_valid_schemas(Api.eval.value))
         all_rows = await self.datasetio_api.get_rows_paginated(
             dataset_id=dataset_id,
-            rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples),
+            rows_in_page=(-1 if benchmark_config.num_examples is None else benchmark_config.num_examples),
         )
         res = await self.evaluate_rows(
             benchmark_id=benchmark_id,
             input_rows=all_rows.rows,
             scoring_functions=scoring_functions,
-            task_config=task_config,
+            benchmark_config=benchmark_config,
         )

         # TODO: currently needs to wait for generation before returning
@@ -108,9 +108,9 @@
         return Job(job_id=job_id)

     async def _run_agent_generation(
-        self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
+        self, input_rows: List[Dict[str, Any]], benchmark_config: BenchmarkConfig
     ) -> List[Dict[str, Any]]:
-        candidate = task_config.eval_candidate
+        candidate = benchmark_config.eval_candidate
         create_response = await self.agents_api.create_agent(candidate.config)
         agent_id = create_response.agent_id

@@ -151,9 +151,9 @@
         return generations

     async def _run_model_generation(
-        self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
+        self, input_rows: List[Dict[str, Any]], benchmark_config: BenchmarkConfig
     ) -> List[Dict[str, Any]]:
-        candidate = task_config.eval_candidate
+        candidate = benchmark_config.eval_candidate
         assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"

         generations = []
@@ -189,13 +189,13 @@
         benchmark_id: str,
         input_rows: List[Dict[str, Any]],
         scoring_functions: List[str],
-        task_config: BenchmarkConfig,
+        benchmark_config: BenchmarkConfig,
     ) -> EvaluateResponse:
-        candidate = task_config.eval_candidate
+        candidate = benchmark_config.eval_candidate
         if candidate.type == "agent":
-            generations = await self._run_agent_generation(input_rows, task_config)
+            generations = await self._run_agent_generation(input_rows, benchmark_config)
         elif candidate.type == "model":
-            generations = await self._run_model_generation(input_rows, task_config)
+            generations = await self._run_model_generation(input_rows, benchmark_config)
         else:
             raise ValueError(f"Invalid candidate type: {candidate.type}")

@@ -204,9 +204,9 @@
             input_r | generated_r for input_r, generated_r in zip(input_rows, generations, strict=False)
         ]

-        if task_config.scoring_params is not None:
+        if benchmark_config.scoring_params is not None:
             scoring_functions_dict = {
-                scoring_fn_id: task_config.scoring_params.get(scoring_fn_id, None)
+                scoring_fn_id: benchmark_config.scoring_params.get(scoring_fn_id, None)
                 for scoring_fn_id in scoring_functions
             }
         else:
diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py
index 9ce3a972b..4470ffe4c 100644
--- a/llama_stack/providers/tests/eval/test_eval.py
+++ b/llama_stack/providers/tests/eval/test_eval.py
@@ -68,7 +68,7 @@ class Testeval:
             benchmark_id=benchmark_id,
             input_rows=rows.rows,
             scoring_functions=scoring_functions,
-            task_config=AppBenchmarkConfig(
+            benchmark_config=AppBenchmarkConfig(
                 eval_candidate=ModelCandidate(
                     model=inference_model,
                     sampling_params=SamplingParams(),
@@ -111,7 +111,7 @@ class Testeval:
         )
         response = await eval_impl.run_eval(
             benchmark_id=benchmark_id,
-            task_config=AppBenchmarkConfig(
+            benchmark_config=AppBenchmarkConfig(
                 eval_candidate=ModelCandidate(
                     model=inference_model,
                     sampling_params=SamplingParams(),
@@ -169,7 +169,7 @@ class Testeval:
         benchmark_id = "meta-reference-mmlu"
         response = await eval_impl.run_eval(
             benchmark_id=benchmark_id,
-            task_config=BenchmarkBenchmarkConfig(
+            benchmark_config=BenchmarkBenchmarkConfig(
                 eval_candidate=ModelCandidate(
                     model=inference_model,
                     sampling_params=SamplingParams(),
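
For quick reference, this is how the renamed keyword looks from the caller's side after the patch. A minimal sketch, not part of the diff above: the server address, benchmark id, and model id are placeholders, and the `eval_candidate` fields mirror `ModelCandidate(model=..., sampling_params=...)` from the test changes; adapt them to your own registered benchmark.

```python
from llama_stack_client import LlamaStackClient

# Assumes a locally running Llama Stack distribution.
client = LlamaStackClient(base_url="http://localhost:8321")

# The keyword is now `benchmark_config`; `task_config=` is no longer accepted.
job = client.eval.run_eval(
    benchmark_id="meta-reference::mmmu",  # placeholder: any registered benchmark id
    benchmark_config={
        "type": "benchmark",
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.2-3B-Instruct",  # placeholder model id
            "sampling_params": {"max_tokens": 512},
        },
    },
)
print(job.job_id)
```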