diff --git a/llama_stack/apis/evals/evals.py b/llama_stack/apis/evals/evals.py index fb3aa6cd4..ea985ad3b 100644 --- a/llama_stack/apis/evals/evals.py +++ b/llama_stack/apis/evals/evals.py @@ -89,15 +89,21 @@ class EvaluatePostprocessConfig(BaseModel): kwargs: Optional[Dict[str, Any]] = None +@json_schema_type +class EvaluateProcessorConfig(BaseModel): + processor_identifier: str + preprocess_config: Optional[EvaluatePreprocessConfig] = None + postprocess_config: Optional[EvaluatePostprocessConfig] = None + + @json_schema_type class EvaluateJudgeScoringConfig(BaseModel): ... @json_schema_type class LLMJudgeConfig(BaseModel): - judge_preprocess_config: EvaluatePreprocessConfig + judge_processor_config: EvaluateProcessorConfig judge_model_generation_config: EvaluateModelGenerationConfig - judge_postprocess_config: EvaluatePostprocessConfig judge_scoring_config: EvaluateJudgeScoringConfig @@ -116,9 +122,8 @@ class EvaluateScoringConfig(BaseModel): @json_schema_type class EvaluateTaskConfig(BaseModel): dataset_config: EvaluateDatasetConfig - preprocess_config: Optional[EvaluatePreprocessConfig] = None + processor_config: EvaluateProcessorConfig generation_config: EvaluateModelGenerationConfig - postprocess_config: Optional[EvaluatePostprocessConfig] = None scoring_config: EvaluateScoringConfig diff --git a/llama_stack/distribution/registry/generator_processors/__init__.py b/llama_stack/distribution/registry/generator_processors/__init__.py index bb9d5c182..44972cf03 100644 --- a/llama_stack/distribution/registry/generator_processors/__init__.py +++ b/llama_stack/distribution/registry/generator_processors/__init__.py @@ -4,9 +4,16 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. from llama_stack.apis.evals import * # noqa: F403 +from llama_stack.providers.impls.meta_reference.evals.processor import * # noqa: F403 from ..registry import Registry +# TODO: decide whether we should group dataset+processor together via Tasks +GeneratorProcessorRegistry = Registry[BaseGeneratorProcessor]() -class GeneratorProcessorRegistry(Registry[BaseGeneratorProcessor]): - _REGISTRY: Dict[str, BaseGeneratorProcessor] = {} +PROCESSOR_REGISTRY = { + "mmlu": MMLUProcessor, +} + +for k, v in PROCESSOR_REGISTRY.items(): + GeneratorProcessorRegistry.register(k, v) diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py index abd1938ad..80bf2dd7a 100644 --- a/llama_stack/providers/impls/meta_reference/evals/evals.py +++ b/llama_stack/providers/impls/meta_reference/evals/evals.py @@ -47,6 +47,9 @@ class MetaReferenceEvalsImpl(Evals): dataset_name=dataset, row_limit=3, ), + processor_config=EvaluateProcessorConfig( + processor_identifier="mmlu", + ), generation_config=EvaluateModelGenerationConfig( model=model, ), diff --git a/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py index 756f351d8..f782f9320 100644 --- a/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py +++ b/llama_stack/providers/impls/meta_reference/evals/processor/__init__.py @@ -3,3 +3,4 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from .mmlu_processor import MMLUProcessor # noqa: F401 diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py index 756f351d8..6424963f8 100644 --- a/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py +++ b/llama_stack/providers/impls/meta_reference/evals/scorer/__init__.py @@ -3,3 +3,5 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from .basic_scorers import * # noqa: F401 F403 +from .aggregate_scorer import * # noqa: F401 F403 diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py index 48c450914..83f6264c0 100644 --- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py +++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py @@ -4,15 +4,17 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. from llama_stack.distribution.registry.datasets import DatasetRegistry +from llama_stack.distribution.registry.generator_processors import ( + GeneratorProcessorRegistry, +) from llama_stack.distribution.registry.scorers import ScorerRegistry + from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import * # noqa: F403 from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import ( InferenceGenerator, ) -from llama_stack.providers.impls.meta_reference.evals.processor.mmlu_processor import ( - MMLUProcessor, -) + from llama_stack.apis.evals import * # noqa: F403 from llama_stack.apis.inference import * # noqa: F403 @@ -46,7 +48,10 @@ class RunEvalTask(BaseTask): print(f"Running on {len(dataset)} samples") # F1 - processor = MMLUProcessor() + print(GeneratorProcessorRegistry.names()) + processor = GeneratorProcessorRegistry.get( + eval_task_config.processor_config.processor_identifier + )() preprocessed = processor.preprocess(dataset) # Generation