feat(api): (1/n) datasets api clean up (#1573)

## PR Stack
- https://github.com/meta-llama/llama-stack/pull/1573
- https://github.com/meta-llama/llama-stack/pull/1625
- https://github.com/meta-llama/llama-stack/pull/1656
- https://github.com/meta-llama/llama-stack/pull/1657
- https://github.com/meta-llama/llama-stack/pull/1658
- https://github.com/meta-llama/llama-stack/pull/1659
- https://github.com/meta-llama/llama-stack/pull/1660

**Client SDK**
- https://github.com/meta-llama/llama-stack-client-python/pull/203

**CI**
- 1391130488
<img width="1042" alt="image"
src="https://github.com/user-attachments/assets/69636067-376d-436b-9204-896e2dd490ca"
/>
-- the test_rag_agent_with_attachments is flaky and not related to this
PR

## Doc
<img width="789" alt="image"
src="https://github.com/user-attachments/assets/b88390f3-73d6-4483-b09a-a192064e32d9"
/>


## Client Usage
```python
client.datasets.register(
    source={
        "type": "uri",
        "uri": "lsfs://mydata.jsonl",
    },
    schema="jsonl_messages",
    # optional 
    dataset_id="my_first_train_data"
)

# quick prototype debugging
client.datasets.register(
    data_reference={
        "type": "rows",
        "rows": [
                "messages": [...],
        ],
    },
    schema="jsonl_messages",
)
```

## Test Plan
- CI:
1387805545

```
LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/datasets/test_datasets.py
```

```
LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/scoring/test_scoring.py
```

```
pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
```
This commit is contained in:
Xi Yan 2025-03-17 16:55:45 -07:00 committed by GitHub
parent 3b35a39b8b
commit 5287b437ae
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
29 changed files with 2593 additions and 2296 deletions

View file

@ -435,7 +435,7 @@ class Generator:
)
self.schema_builder = SchemaBuilder(schema_generator)
self.responses = {}
# Create standard error responses
self._create_standard_error_responses()
@ -446,7 +446,7 @@ class Generator:
"""
# Get the Error schema
error_schema = self.schema_builder.classdef_to_ref(Error)
# Create standard error responses
self.responses["BadRequest400"] = Response(
description="The request was invalid or malformed",
@ -457,11 +457,11 @@ class Generator:
"status": 400,
"title": "Bad Request",
"detail": "The request was invalid or malformed",
}
},
)
}
},
)
self.responses["TooManyRequests429"] = Response(
description="The client has sent too many requests in a given amount of time",
content={
@ -471,11 +471,11 @@ class Generator:
"status": 429,
"title": "Too Many Requests",
"detail": "You have exceeded the rate limit. Please try again later.",
}
},
)
}
},
)
self.responses["InternalServerError500"] = Response(
description="The server encountered an unexpected error",
content={
@ -485,11 +485,11 @@ class Generator:
"status": 500,
"title": "Internal Server Error",
"detail": "An unexpected error occurred. Our team has been notified.",
}
},
)
}
},
)
# Add a default error response for any unhandled error cases
self.responses["DefaultError"] = Response(
description="An unexpected error occurred",
@ -500,9 +500,9 @@ class Generator:
"status": 0,
"title": "Error",
"detail": "An unexpected error occurred",
}
},
)
}
},
)
def _build_type_tag(self, ref: str, schema: Schema) -> Tag:
@ -547,11 +547,14 @@ class Generator:
"SyntheticDataGeneration",
"PostTraining",
"BatchInference",
"Files",
]:
op.defining_class.__name__ = f"{op.defining_class.__name__} (Coming Soon)"
print(op.defining_class.__name__)
# TODO (xiyan): temporary fix for datasetio inner impl + datasets api
# if op.defining_class.__name__ in ["DatasetIO"]:
# op.defining_class.__name__ = "Datasets"
doc_string = parse_type(op.func_ref)
doc_params = dict(
(param.name, param.description) for param in doc_string.params.values()
@ -598,7 +601,9 @@ class Generator:
# data passed in request body as raw bytes cannot have request parameters
if raw_bytes_request_body and op.request_params:
raise ValueError("Cannot have both raw bytes request body and request parameters")
raise ValueError(
"Cannot have both raw bytes request body and request parameters"
)
# data passed in request body as raw bytes
if raw_bytes_request_body:
@ -719,7 +724,7 @@ class Generator:
responses.update(response_builder.build_response(response_options))
assert len(responses.keys()) > 0, f"No responses found for {op.name}"
# Add standard error response references
if self.options.include_standard_error_responses:
if "400" not in responses:
@ -730,7 +735,7 @@ class Generator:
responses["500"] = ResponseRef("InternalServerError500")
if "default" not in responses:
responses["default"] = ResponseRef("DefaultError")
if op.event_type is not None:
builder = ContentBuilder(self.schema_builder)
callbacks = {