change schema -> dataset_schema (#442)

# What does this PR do?

- `schema` should not be used as a pydantic field name, since it shadows `BaseModel.schema` and triggers pydantic warnings
- change `schema` to `dataset_schema`

<img width="855" alt="image"
src="https://github.com/user-attachments/assets/47cb6bb9-4be0-46a5-8701-24d24e2eaabd">


## Test Plan

```
pytest -v -s -m meta_reference_eval_together_inference_huggingface_datasetio eval/test_eval.py
```


## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor
guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md),
      Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.
This commit is contained in:
Xi Yan 2024-11-13 10:58:12 -05:00 committed by GitHub
parent c29fa56dde
commit d5b1202c83
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 15 additions and 15 deletions

View file

@ -17,7 +17,7 @@ from llama_stack.apis.resource import Resource, ResourceType
class CommonDatasetFields(BaseModel): class CommonDatasetFields(BaseModel):
schema: Dict[str, ParamType] dataset_schema: Dict[str, ParamType]
url: URL url: URL
metadata: Dict[str, Any] = Field( metadata: Dict[str, Any] = Field(
default_factory=dict, default_factory=dict,

View file

@ -332,7 +332,7 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
identifier=dataset_id, identifier=dataset_id,
provider_resource_id=provider_dataset_id, provider_resource_id=provider_dataset_id,
provider_id=provider_id, provider_id=provider_id,
schema=schema, dataset_schema=schema,
url=url, url=url,
metadata=metadata, metadata=metadata,
) )

View file

@ -60,9 +60,9 @@ class PandasDataframeDataset(BaseDataset):
def _validate_dataset_schema(self, df) -> pandas.DataFrame: def _validate_dataset_schema(self, df) -> pandas.DataFrame:
# note that we will drop any columns in dataset that are not in the schema # note that we will drop any columns in dataset that are not in the schema
df = df[self.dataset_def.schema.keys()] df = df[self.dataset_def.dataset_schema.keys()]
# check all columns in dataset schema are present # check all columns in dataset schema are present
assert len(df.columns) == len(self.dataset_def.schema) assert len(df.columns) == len(self.dataset_def.dataset_schema)
# TODO: type checking against column types in dataset schema # TODO: type checking against column types in dataset schema
return df return df

View file

@ -58,7 +58,7 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None: async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None:
dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
if not dataset_def.schema or len(dataset_def.schema) == 0: if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0:
raise ValueError(f"Dataset {dataset_id} does not have a schema defined.") raise ValueError(f"Dataset {dataset_id} does not have a schema defined.")
expected_schemas = [ expected_schemas = [
@ -74,7 +74,7 @@ class MetaReferenceEvalImpl(Eval, EvalTasksProtocolPrivate):
}, },
] ]
if dataset_def.schema not in expected_schemas: if dataset_def.dataset_schema not in expected_schemas:
raise ValueError( raise ValueError(
f"Dataset {dataset_id} does not have a correct input schema in {expected_schemas}" f"Dataset {dataset_id} does not have a correct input schema in {expected_schemas}"
) )

View file

@ -60,17 +60,17 @@ class BasicScoringImpl(Scoring, ScoringFunctionsProtocolPrivate):
async def validate_scoring_input_dataset_schema(self, dataset_id: str) -> None: async def validate_scoring_input_dataset_schema(self, dataset_id: str) -> None:
dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
if not dataset_def.schema or len(dataset_def.schema) == 0: if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0:
raise ValueError( raise ValueError(
f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset." f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset."
) )
for required_column in ["generated_answer", "expected_answer", "input_query"]: for required_column in ["generated_answer", "expected_answer", "input_query"]:
if required_column not in dataset_def.schema: if required_column not in dataset_def.dataset_schema:
raise ValueError( raise ValueError(
f"Dataset {dataset_id} does not have a '{required_column}' column." f"Dataset {dataset_id} does not have a '{required_column}' column."
) )
if dataset_def.schema[required_column].type != "string": if dataset_def.dataset_schema[required_column].type != "string":
raise ValueError( raise ValueError(
f"Dataset {dataset_id} does not have a '{required_column}' column of type 'string'." f"Dataset {dataset_id} does not have a '{required_column}' column of type 'string'."
) )

View file

@ -64,17 +64,17 @@ class BraintrustScoringImpl(Scoring, ScoringFunctionsProtocolPrivate):
async def validate_scoring_input_dataset_schema(self, dataset_id: str) -> None: async def validate_scoring_input_dataset_schema(self, dataset_id: str) -> None:
dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
if not dataset_def.schema or len(dataset_def.schema) == 0: if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0:
raise ValueError( raise ValueError(
f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset." f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset."
) )
for required_column in ["generated_answer", "expected_answer", "input_query"]: for required_column in ["generated_answer", "expected_answer", "input_query"]:
if required_column not in dataset_def.schema: if required_column not in dataset_def.dataset_schema:
raise ValueError( raise ValueError(
f"Dataset {dataset_id} does not have a '{required_column}' column." f"Dataset {dataset_id} does not have a '{required_column}' column."
) )
if dataset_def.schema[required_column].type != "string": if dataset_def.dataset_schema[required_column].type != "string":
raise ValueError( raise ValueError(
f"Dataset {dataset_id} does not have a '{required_column}' column of type 'string'." f"Dataset {dataset_id} does not have a '{required_column}' column of type 'string'."
) )

View file

@ -67,17 +67,17 @@ class LlmAsJudgeScoringImpl(Scoring, ScoringFunctionsProtocolPrivate):
async def validate_scoring_input_dataset_schema(self, dataset_id: str) -> None: async def validate_scoring_input_dataset_schema(self, dataset_id: str) -> None:
dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id) dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
if not dataset_def.schema or len(dataset_def.schema) == 0: if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0:
raise ValueError( raise ValueError(
f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset." f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset."
) )
for required_column in ["generated_answer", "expected_answer", "input_query"]: for required_column in ["generated_answer", "expected_answer", "input_query"]:
if required_column not in dataset_def.schema: if required_column not in dataset_def.dataset_schema:
raise ValueError( raise ValueError(
f"Dataset {dataset_id} does not have a '{required_column}' column." f"Dataset {dataset_id} does not have a '{required_column}' column."
) )
if dataset_def.schema[required_column].type != "string": if dataset_def.dataset_schema[required_column].type != "string":
raise ValueError( raise ValueError(
f"Dataset {dataset_id} does not have a '{required_column}' column of type 'string'." f"Dataset {dataset_id} does not have a '{required_column}' column of type 'string'."
) )