diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index 4bf3d0187..348d8449d 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -97,7 +97,6 @@ class InferenceRouter(Inference):
             logprobs=logprobs,
         )
         provider = self.routing_table.get_provider_impl(model)
-
         if stream:
             return (chunk async for chunk in await provider.chat_completion(**params))
         else:
diff --git a/llama_stack/providers/impls/meta_reference/eval/eval.py b/llama_stack/providers/impls/meta_reference/eval/eval.py
index d675e40eb..3aec6170f 100644
--- a/llama_stack/providers/impls/meta_reference/eval/eval.py
+++ b/llama_stack/providers/impls/meta_reference/eval/eval.py
@@ -18,6 +18,7 @@ from .config import MetaReferenceEvalConfig
 
 
 class ColumnName(Enum):
+    input_query = "input_query"
     expected_answer = "expected_answer"
     chat_completion_input = "chat_completion_input"
     completion_input = "completion_input"
@@ -53,10 +54,12 @@ class MetaReferenceEvalImpl(Eval):
 
         expected_schemas = [
             {
+                ColumnName.input_query.value: StringType(),
                 ColumnName.expected_answer.value: StringType(),
                 ColumnName.chat_completion_input.value: ChatCompletionInputType(),
             },
             {
+                ColumnName.input_query.value: StringType(),
                 ColumnName.expected_answer.value: StringType(),
                 ColumnName.completion_input.value: CompletionInputType(),
             },
diff --git a/llama_stack/providers/tests/datasetio/test_datasetio.py b/llama_stack/providers/tests/datasetio/test_datasetio.py
index 9bd80f94d..743e191d4 100644
--- a/llama_stack/providers/tests/datasetio/test_datasetio.py
+++ b/llama_stack/providers/tests/datasetio/test_datasetio.py
@@ -70,6 +70,7 @@ async def register_dataset(
     if for_generation:
         dataset_schema = {
             "expected_answer": StringType(),
+            "input_query": StringType(),
             "chat_completion_input": ChatCompletionInputType(),
         }
     else:
diff --git a/llama_stack/providers/tests/eval/provider_config_example.yaml b/llama_stack/providers/tests/eval/provider_config_example.yaml
index 1576d2ef0..38f7512f1 100644
--- a/llama_stack/providers/tests/eval/provider_config_example.yaml
+++ b/llama_stack/providers/tests/eval/provider_config_example.yaml
@@ -16,3 +16,7 @@ providers:
       provider_type: remote::tgi
       config:
         url: http://127.0.0.1:5009
+    - provider_id: test-tgi-2
+      provider_type: remote::tgi
+      config:
+        url: http://127.0.0.1:5010
diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py
index 6b0d99a22..dc6faaffc 100644
--- a/llama_stack/providers/tests/eval/test_eval.py
+++ b/llama_stack/providers/tests/eval/test_eval.py
@@ -65,7 +65,10 @@ async def test_eval(eval_settings):
             model="Llama3.2-1B-Instruct",
             sampling_params=SamplingParams(),
         ),
-        scoring_functions=["subset_of"],
+        scoring_functions=[
+            "meta-reference::subset_of",
+            "meta-reference::llm_as_judge_8b_correctness",
+        ],
     )
     assert response.job_id == "0"
     job_status = await eval_impl.job_status(response.job_id)
@@ -74,6 +77,8 @@ async def test_eval(eval_settings):
 
     eval_response = await eval_impl.job_result(response.job_id)
 
+    print(eval_response)
     assert eval_response is not None
     assert len(eval_response.generations) == 5
-    assert "subset_of" in eval_response.scores
+    assert "meta-reference::subset_of" in eval_response.scores
+    assert "meta-reference::llm_as_judge_8b_correctness" in eval_response.scores