# What does this PR do?

- add ability to run agent generation for full eval (generate + scoring)
- pre-register the SimpleQA benchmark llm-as-judge scoring function in code

## Test Plan

#### SimpleQA w/ Search

- eval_task_config_simpleqa_search.json (one way to submit this config is sketched after the checklist below)

```json
{
  "type": "benchmark",
  "eval_candidate": {
    "type": "agent",
    "config": {
      "model": "Llama3.1-405B-Instruct",
      "instructions": "Please use the search tool to answer the question.",
      "sampling_params": {
        "strategy": "greedy",
        "temperature": 1.0,
        "top_p": 0.9
      },
      "tools": [
        {
          "type": "brave_search",
          "engine": "brave",
          "api_key": "API_KEY"
        }
      ],
      "tool_choice": "auto",
      "tool_prompt_format": "json",
      "input_shields": [],
      "output_shields": [],
      "enable_session_persistence": false
    }
  }
}
```

#### SimpleQA w/o Search

## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.
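As a usage illustration, here is a minimal sketch of submitting a config like the one above to a running stack. The server address, endpoint route, and task id are assumptions for illustration only; they are not defined by this PR.

```python
# Hypothetical sketch: submit the agent eval config to a running llama-stack
# server. The URL, route, and task id below are assumptions, not part of this PR.
import json

import requests

with open("eval_task_config_simpleqa_search.json") as f:
    task_config = json.load(f)

resp = requests.post(
    "http://localhost:5000/eval/run_eval",  # assumed server address and route
    json={
        "task_id": "meta-reference::simpleqa",  # hypothetical benchmark id
        "task_config": task_config,
    },
)
resp.raise_for_status()
print(resp.json())  # e.g. a job handle, or generations plus scores
```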
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import List

from llama_stack.distribution.datatypes import *  # noqa: F403


def available_providers() -> List[ProviderSpec]:
    # Registry entry for the inline meta-reference eval provider.
    return [
        InlineProviderSpec(
            api=Api.eval,
            provider_type="inline::meta-reference",
            pip_packages=[],
            module="llama_stack.providers.inline.eval.meta_reference",
            config_class="llama_stack.providers.inline.eval.meta_reference.MetaReferenceEvalConfig",
            api_dependencies=[
                Api.datasetio,
                Api.datasets,
                Api.scoring,
                Api.inference,
                # agents is a dependency so eval can run agent generation (this PR)
                Api.agents,
            ],
        ),
    ]
```
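For illustration, a quick way to inspect the spec registered above; the import path here is an assumption inferred from the `module` strings in this file and may differ.

```python
# Hypothetical sketch: list the eval provider spec registered above.
# The import path is inferred from the module naming and may differ.
from llama_stack.providers.registry.eval import available_providers

for spec in available_providers():
    print(spec.provider_type)  # "inline::meta-reference"
    # Api.agents now appears among the dependencies, which is what lets the
    # eval runner drive agent-based generation.
    print([dep.value for dep in spec.api_dependencies])
```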