mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-06-28 02:53:30 +00:00
test: revamp eval related integration tests (#1433)
# What does this PR do?

- revamp and clean up datasets/scoring/eval integration tests
- closes https://github.com/meta-llama/llama-stack/issues/1396

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan

**dataset**

```
LLAMA_STACK_BASE_URL=http://localhost:8321 pytest -v tests/integration/datasetio/
```

<img width="842" alt="image" src="https://github.com/user-attachments/assets/88fc2b6a-b496-47bf-bc0c-8fea48ba36ff" />

**scoring**

```
LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/scoring --text-model meta-llama/Llama-3.1-8B-Instruct --judge-model meta-llama/Llama-3.1-8B-Instruct
```

<img width="851" alt="image" src="https://github.com/user-attachments/assets/50f46415-b44c-4c37-a6c3-076f2767adb3" />

**eval**

```
LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/eval --text-model meta-llama/Llama-3.1-8B-Instruct --judge-model meta-llama/Llama-3.1-8B-Instruct
```

<img width="841" alt="image" src="https://github.com/user-attachments/assets/8eb1c65c-3b39-4d66-8ff4-f471ca783e49" />

[//]: # (## Documentation)
This commit is contained in:
parent
82e94fe22f
commit
bcb13c492f
7 changed files with 184 additions and 222 deletions
|
@ -6,7 +6,7 @@
|
|||
import re
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from llama_stack.apis.inference.inference import Inference
|
||||
from llama_stack.apis.inference.inference import Inference, UserMessage
|
||||
from llama_stack.apis.scoring import ScoringResultRow
|
||||
from llama_stack.apis.scoring_functions import ScoringFnParams
|
||||
from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
|
||||
|
@ -58,10 +58,9 @@ class LlmAsJudgeScoringFn(RegisteredBaseScoringFn):
|
|||
judge_response = await self.inference_api.chat_completion(
|
||||
model_id=fn_def.params.judge_model,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": judge_input_msg,
|
||||
}
|
||||
UserMessage(
|
||||
content=judge_input_msg,
|
||||
),
|
||||
],
|
||||
)
|
||||
content = judge_response.completion_message.content
|
||||
|
|
|
@ -73,6 +73,11 @@ class RegisteredBaseScoringFn(BaseScoringFn):
|
|||
raise ValueError(f"Scoring function def with identifier {scoring_fn.identifier} already exists.")
|
||||
self.supported_fn_defs_registry[scoring_fn.identifier] = scoring_fn
|
||||
|
||||
def unregister_scoring_fn_def(self, scoring_fn_id: str) -> None:
    """Remove a scoring function definition from the registry.

    Deletes the entry keyed by ``scoring_fn_id`` from
    ``self.supported_fn_defs_registry``.

    Args:
        scoring_fn_id: Identifier of the scoring function definition to remove.

    Raises:
        ValueError: If no definition is registered under ``scoring_fn_id``.
    """
    try:
        # Single lookup instead of check-then-delete; relies on the registry
        # raising KeyError for a missing key (presumably a plain dict).
        del self.supported_fn_defs_registry[scoring_fn_id]
    except KeyError:
        # `from None` keeps the traceback identical to a direct ValueError.
        raise ValueError(f"Scoring function def with identifier {scoring_fn_id} does not exist.") from None
|
||||
|
||||
@abstractmethod
|
||||
async def score_row(
|
||||
self,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue