[Evals API][6/n] meta-reference llm as judge, registration for ScoringFnDefs (#330)

* wip scoring refactor

* llm as judge, move folders

* test full generation + eval

* extract score regex to llm context

* remove prints, cleanup braintrust in this branch

* change json -> class

* remove initialize

* address nits

* check identifier prefix

* update MANIFEST
Xi Yan 2024-10-28 14:08:42 -07:00 committed by GitHub
parent 04a4784287
commit 7b8748c53e
20 changed files with 360 additions and 50 deletions
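The commit message items above ("llm as judge", "change json -> class", "extract score regex to llm context", "check identifier prefix") suggest that scoring functions are now declared as typed definition objects registered under a provider-prefixed identifier, with the judge prompt and score-extraction regex carried in an LLM context attached to the definition. A minimal sketch of what such a definition could look like; ScoringFnDef, LLMAsJudgeContext, and every field name below are assumptions inferred from the commit message, not the actual llama-stack API:

    # Hypothetical sketch only: ScoringFnDef / LLMAsJudgeContext and their
    # fields are assumed names inferred from the commit message, not the
    # real llama-stack classes.
    from typing import Optional

    from pydantic import BaseModel


    class LLMAsJudgeContext(BaseModel):
        # Model used to grade generations, a prompt template, and the
        # regexes used to pull a numeric score out of the judge's reply
        # ("extract score regex to llm context").
        judge_model: str
        prompt_template: str
        judge_score_regex: list[str]


    class ScoringFnDef(BaseModel):
        # Identifiers are presumably validated for a provider prefix at
        # registration time ("check identifier prefix"), e.g. "meta-reference::...".
        identifier: str
        description: Optional[str] = None
        context: Optional[LLMAsJudgeContext] = None


    llm_as_judge_8b_correctness = ScoringFnDef(
        identifier="meta-reference::llm_as_judge_8b_correctness",
        description="Judge the correctness of an answer with an 8B model",
        context=LLMAsJudgeContext(
            judge_model="Llama3.1-8B-Instruct",
            prompt_template=(
                "Rate the answer from 1 to 5.\n"
                "Question: {input_query}\n"
                "Answer: {generated_answer}\n"
                "Reply in the form 'Score: <n>'."
            ),
            judge_score_regex=[r"Score:\s*(\d+)"],
        ),
    )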


@@ -65,7 +65,10 @@ async def test_eval(eval_settings):
             model="Llama3.2-1B-Instruct",
             sampling_params=SamplingParams(),
         ),
-        scoring_functions=["subset_of"],
+        scoring_functions=[
+            "meta-reference::subset_of",
+            "meta-reference::llm_as_judge_8b_correctness",
+        ],
     )
     assert response.job_id == "0"
     job_status = await eval_impl.job_status(response.job_id)
@@ -76,4 +79,5 @@ async def test_eval(eval_settings):
     assert eval_response is not None
     assert len(eval_response.generations) == 5
-    assert "subset_of" in eval_response.scores
+    assert "meta-reference::subset_of" in eval_response.scores
+    assert "meta-reference::llm_as_judge_8b_correctness" in eval_response.scores