This commit is contained in:
Xi Yan 2025-03-19 10:10:02 -07:00
parent d1b44c1251
commit 443b18a992
5 changed files with 7 additions and 127 deletions

View file

@ -7681,73 +7681,6 @@
"title": "EvaluationResponse", "title": "EvaluationResponse",
"description": "A response to an inline evaluation." "description": "A response to an inline evaluation."
}, },
"ScoringResult": {
"type": "object",
"properties": {
"scores": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
},
"description": "The scoring result for each row. Each row is a map of grader column name to value."
},
"metrics": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "Map of metric name to aggregated value."
}
},
"additionalProperties": false,
"required": [
"scores",
"metrics"
],
"title": "ScoringResult",
"description": "A scoring result for a single row."
},
"HealthInfo": { "HealthInfo": {
"type": "object", "type": "object",
"properties": { "properties": {

View file

@ -5359,41 +5359,6 @@ components:
- grades - grades
title: EvaluationResponse title: EvaluationResponse
description: A response to an inline evaluation. description: A response to an inline evaluation.
ScoringResult:
type: object
properties:
scores:
type: array
items:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
The scoring result for each row. Each row is a map of grader column name
to value.
metrics:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: Map of metric name to aggregated value.
additionalProperties: false
required:
- scores
- metrics
title: ScoringResult
description: A scoring result for a single row.
HealthInfo: HealthInfo:
type: object type: object
properties: properties:

View file

@ -51,14 +51,8 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]:
def providable_apis() -> List[Api]: def providable_apis() -> List[Api]:
routing_table_apis = { routing_table_apis = {x.routing_table_api for x in builtin_automatically_routed_apis()}
x.routing_table_api for x in builtin_automatically_routed_apis() return [api for api in Api if api not in routing_table_apis and api not in [Api.inspect, Api.providers]]
}
return [
api
for api in Api
if api not in routing_table_apis and api not in [Api.inspect, Api.providers]
]
def get_provider_registry() -> Dict[Api, Dict[str, ProviderSpec]]: def get_provider_registry() -> Dict[Api, Dict[str, ProviderSpec]]:

View file

@ -22,9 +22,7 @@ def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
purpose="eval/messages-answer", purpose="eval/messages-answer",
source={ source={
"type": "uri", "type": "uri",
"uri": data_url_from_file( "uri": data_url_from_file(Path(__file__).parent.parent / "datasets" / "test_dataset.csv"),
Path(__file__).parent.parent / "datasets" / "test_dataset.csv"
),
}, },
) )
response = llama_stack_client.datasets.list() response = llama_stack_client.datasets.list()
@ -74,9 +72,7 @@ def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
purpose="eval/messages-answer", purpose="eval/messages-answer",
source={ source={
"type": "uri", "type": "uri",
"uri": data_url_from_file( "uri": data_url_from_file(Path(__file__).parent.parent / "datasets" / "test_dataset.csv"),
Path(__file__).parent.parent / "datasets" / "test_dataset.csv"
),
}, },
) )
benchmark_id = str(uuid.uuid4()) benchmark_id = str(uuid.uuid4())
@ -99,14 +95,10 @@ def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
}, },
) )
assert response.job_id == "0" assert response.job_id == "0"
job_status = llama_stack_client.eval.jobs.status( job_status = llama_stack_client.eval.jobs.status(job_id=response.job_id, benchmark_id=benchmark_id)
job_id=response.job_id, benchmark_id=benchmark_id
)
assert job_status and job_status == "completed" assert job_status and job_status == "completed"
eval_response = llama_stack_client.eval.jobs.retrieve( eval_response = llama_stack_client.eval.jobs.retrieve(job_id=response.job_id, benchmark_id=benchmark_id)
job_id=response.job_id, benchmark_id=benchmark_id
)
assert eval_response is not None assert eval_response is not None
assert len(eval_response.generations) == 5 assert len(eval_response.generations) == 5
assert scoring_fn_id in eval_response.scores assert scoring_fn_id in eval_response.scores

View file

@ -154,11 +154,7 @@ def test_scoring_score_with_aggregation_functions(
df = pd.read_csv(Path(__file__).parent.parent / "datasets" / "test_dataset.csv") df = pd.read_csv(Path(__file__).parent.parent / "datasets" / "test_dataset.csv")
rows = df.to_dict(orient="records") rows = df.to_dict(orient="records")
scoring_fns_list = [ scoring_fns_list = [x for x in llama_stack_client.scoring_functions.list() if x.provider_id == provider_id]
x
for x in llama_stack_client.scoring_functions.list()
if x.provider_id == provider_id
]
if len(scoring_fns_list) == 0: if len(scoring_fns_list) == 0:
pytest.skip(f"No scoring functions found for provider {provider_id}, skipping") pytest.skip(f"No scoring functions found for provider {provider_id}, skipping")