From d5b1202c83bb3955bf70eabe7018c03923968f33 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 13 Nov 2024 10:58:12 -0500 Subject: [PATCH] change schema -> dataset_schema (#442) # What does this PR do? - `schema` should not be used as a field name, since it shadows a pydantic `BaseModel` attribute and triggers pydantic warnings - change `schema` to `dataset_schema` image ## Test Plan ``` pytest -v -s -m meta_reference_eval_together_inference_huggingface_datasetio eval/test_eval.py ``` ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --- llama_stack/apis/datasets/datasets.py | 2 +- llama_stack/distribution/routers/routing_tables.py | 2 +- llama_stack/providers/inline/datasetio/localfs/datasetio.py | 4 ++-- llama_stack/providers/inline/eval/meta_reference/eval.py | 4 ++-- llama_stack/providers/inline/scoring/basic/scoring.py | 6 +++--- .../providers/inline/scoring/braintrust/braintrust.py | 6 +++--- .../providers/inline/scoring/llm_as_judge/scoring.py | 6 +++--- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index 2dc74e6ec..8cd94442b 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -17,7 +17,7 @@ from llama_stack.apis.resource import Resource, ResourceType class CommonDatasetFields(BaseModel): - schema: Dict[str, ParamType] + dataset_schema: Dict[str, ParamType] url: URL metadata: Dict[str, Any] = Field( default_factory=dict, diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 5342728b1..c039d3cb1 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ 
b/llama_stack/distribution/routers/routing_tables.py @@ -332,7 +332,7 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets): identifier=dataset_id, provider_resource_id=provider_dataset_id, provider_id=provider_id, - schema=schema, + dataset_schema=schema, url=url, metadata=metadata, ) diff --git a/llama_stack/providers/inline/datasetio/localfs/datasetio.py b/llama_stack/providers/inline/datasetio/localfs/datasetio.py index f54905a6b..4de1850ae 100644 --- a/llama_stack/providers/inline/datasetio/localfs/datasetio.py +++ b/llama_stack/providers/inline/datasetio/localfs/datasetio.py @@ -60,9 +60,9 @@ class PandasDataframeDataset(BaseDataset): def _validate_dataset_schema(self, df) -> pandas.DataFrame: # note that we will drop any columns in dataset that are not in the schema - df = df[self.dataset_def.schema.keys()] + df = df[self.dataset_def.dataset_schema.keys()] # check all columns in dataset schema are present - assert len(df.columns) == len(self.dataset_def.schema) + assert len(df.columns) == len(self.dataset_def.dataset_schema) # TODO: type checking against column types in dataset schema return df diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 58241eb42..35df90788 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -58,7 +58,7 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate): async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None: dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) - if not dataset_def.schema or len(dataset_def.schema) == 0: + if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0: raise ValueError(f"Dataset {dataset_id} does not have a schema defined.") expected_schemas = [ @@ -74,7 +74,7 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate): }, ] - if dataset_def.schema not in 
expected_schemas: + if dataset_def.dataset_schema not in expected_schemas: raise ValueError( f"Dataset {dataset_id} does not have a correct input schema in {expected_schemas}" ) diff --git a/llama_stack/providers/inline/scoring/basic/scoring.py b/llama_stack/providers/inline/scoring/basic/scoring.py index 98803ae4a..ac8f8630f 100644 --- a/llama_stack/providers/inline/scoring/basic/scoring.py +++ b/llama_stack/providers/inline/scoring/basic/scoring.py @@ -60,17 +60,17 @@ class BasicScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): async def validate_scoring_input_dataset_schema(self, dataset_id: str) -> None: dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) - if not dataset_def.schema or len(dataset_def.schema) == 0: + if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0: raise ValueError( f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset." ) for required_column in ["generated_answer", "expected_answer", "input_query"]: - if required_column not in dataset_def.schema: + if required_column not in dataset_def.dataset_schema: raise ValueError( f"Dataset {dataset_id} does not have a '{required_column}' column." ) - if dataset_def.schema[required_column].type != "string": + if dataset_def.dataset_schema[required_column].type != "string": raise ValueError( f"Dataset {dataset_id} does not have a '{required_column}' column of type 'string'." 
) diff --git a/llama_stack/providers/inline/scoring/braintrust/braintrust.py b/llama_stack/providers/inline/scoring/braintrust/braintrust.py index 973232f4e..00817bb33 100644 --- a/llama_stack/providers/inline/scoring/braintrust/braintrust.py +++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py @@ -64,17 +64,17 @@ class BraintrustScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): async def validate_scoring_input_dataset_schema(self, dataset_id: str) -> None: dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) - if not dataset_def.schema or len(dataset_def.schema) == 0: + if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0: raise ValueError( f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset." ) for required_column in ["generated_answer", "expected_answer", "input_query"]: - if required_column not in dataset_def.schema: + if required_column not in dataset_def.dataset_schema: raise ValueError( f"Dataset {dataset_id} does not have a '{required_column}' column." ) - if dataset_def.schema[required_column].type != "string": + if dataset_def.dataset_schema[required_column].type != "string": raise ValueError( f"Dataset {dataset_id} does not have a '{required_column}' column of type 'string'." 
) diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py b/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py index 0cb81e114..33462631c 100644 --- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py @@ -67,17 +67,17 @@ class LlmAsJudgeScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): async def validate_scoring_input_dataset_schema(self, dataset_id: str) -> None: dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) - if not dataset_def.schema or len(dataset_def.schema) == 0: + if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0: raise ValueError( f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset." ) for required_column in ["generated_answer", "expected_answer", "input_query"]: - if required_column not in dataset_def.schema: + if required_column not in dataset_def.dataset_schema: raise ValueError( f"Dataset {dataset_id} does not have a '{required_column}' column." ) - if dataset_def.schema[required_column].type != "string": + if dataset_def.dataset_schema[required_column].type != "string": raise ValueError( f"Dataset {dataset_id} does not have a '{required_column}' column of type 'string'." )