[Evals API][6/n] meta-reference llm as judge, registration for ScoringFnDefs (#330)
* wip scoring refactor
* llm as judge, move folders
* test full generation + eval
* extract score regex to llm context
* remove prints, cleanup braintrust in this branch
* change json -> class
* remove initialize
* address nits
* check identifier prefix
* update MANIFEST
parent 04a4784287 · commit 7b8748c53e
20 changed files with 360 additions and 50 deletions
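The commit message mentions moving the score-extraction regex into the judge LLM's context. As an illustration only (the names `judge_generate`, `JUDGE_PROMPT`, and `SCORE_REGEX` below are hypothetical stand-ins, not the repo's actual API), a minimal sketch of LLM-as-judge scoring might look like this: prompt a judge model with the question, expected answer, and generated answer, then parse a numeric score out of its free-form reply with a regex.

```python
import re

# Hypothetical judge prompt; in the commit, the real template and regex
# live in the scoring fn's LLM context rather than being hard-coded.
JUDGE_PROMPT = """You are grading an answer against an expected answer.
Question: {input_query}
Expected: {expected_answer}
Answer: {generated_answer}
Reply with a line of the form "Score: <0-5>" and one sentence of feedback.
"""

# Regex used to pull the numeric score out of the judge's reply.
SCORE_REGEX = re.compile(r"Score:\s*(\d+)", re.IGNORECASE)


def llm_as_judge_score(judge_generate, input_query: str,
                       expected_answer: str, generated_answer: str):
    """Grade one generation with a judge model and parse its reply.

    `judge_generate` is a stand-in for a call to the judge model (e.g. an
    8B instruct model): it takes a prompt string and returns the reply text.
    Returns the parsed score as a float, or None if no score line is found.
    """
    prompt = JUDGE_PROMPT.format(
        input_query=input_query,
        expected_answer=expected_answer,
        generated_answer=generated_answer,
    )
    reply = judge_generate(prompt)
    match = SCORE_REGEX.search(reply)
    return float(match.group(1)) if match else None
```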
```diff
@@ -65,7 +65,10 @@ async def test_eval(eval_settings):
             model="Llama3.2-1B-Instruct",
             sampling_params=SamplingParams(),
         ),
-        scoring_functions=["subset_of"],
+        scoring_functions=[
+            "meta-reference::subset_of",
+            "meta-reference::llm_as_judge_8b_correctness",
+        ],
     )
     assert response.job_id == "0"
     job_status = await eval_impl.job_status(response.job_id)
@@ -76,4 +79,5 @@ async def test_eval(eval_settings):
 
     assert eval_response is not None
     assert len(eval_response.generations) == 5
-    assert "subset_of" in eval_response.scores
+    assert "meta-reference::subset_of" in eval_response.scores
+    assert "meta-reference::llm_as_judge_8b_correctness" in eval_response.scores
```
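The test now refers to scoring functions by provider-namespaced identifiers (`meta-reference::subset_of`, `meta-reference::llm_as_judge_8b_correctness`), and the commit message notes a "check identifier prefix" step in ScoringFnDef registration. A minimal sketch of that idea, assuming hypothetical `ScoringFnDef` and `ScoringFnRegistry` stand-ins (the real registration path in llama-stack differs):

```python
from dataclasses import dataclass


@dataclass
class ScoringFnDef:
    # Hypothetical, minimal stand-in for a scoring fn definition;
    # e.g. identifier = "meta-reference::llm_as_judge_8b_correctness".
    identifier: str


class ScoringFnRegistry:
    """Registers scoring fn defs, enforcing a provider-namespaced identifier."""

    def __init__(self, provider_id: str):
        self.provider_id = provider_id
        self._defs: dict[str, ScoringFnDef] = {}

    def register(self, fn_def: ScoringFnDef) -> None:
        prefix = f"{self.provider_id}::"
        # Prefix check of the kind the commit message describes: reject
        # identifiers that do not carry their provider's namespace.
        if not fn_def.identifier.startswith(prefix):
            raise ValueError(
                f"identifier {fn_def.identifier!r} must start with {prefix!r}"
            )
        self._defs[fn_def.identifier] = fn_def


registry = ScoringFnRegistry("meta-reference")
registry.register(ScoringFnDef("meta-reference::subset_of"))
registry.register(ScoringFnDef("meta-reference::llm_as_judge_8b_correctness"))
```

Namespacing identifiers this way explains the test change above: the keys of `eval_response.scores` become the fully qualified names rather than bare function names.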