Simplified Telemetry API and tying it to logger (#57)

* Simplified Telemetry API and tying it to logger

* small update which adds a METRIC type

* move span events one level down into structured log events

---------

Co-authored-by: Ashwin Bharambe <ashwin@meta.com>
This commit is contained in:
Ashwin Bharambe 2024-09-11 14:25:37 -07:00 committed by GitHub
parent 1433aaf9f7
commit 191cd28831
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 524 additions and 162 deletions

View file

@ -6,170 +6,126 @@
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional, Protocol, Union
from typing import Any, Dict, Literal, Optional, Protocol, Union
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel
from pydantic import BaseModel, Field
from typing_extensions import Annotated
@json_schema_type
class ExperimentStatus(Enum):
NOT_STARTED = "not_started"
RUNNING = "running"
COMPLETED = "completed"
FAILED = "failed"
class SpanStatus(Enum):
OK = "ok"
ERROR = "error"
@json_schema_type
class Experiment(BaseModel):
id: str
class Span(BaseModel):
span_id: str
trace_id: str
parent_span_id: Optional[str] = None
name: str
status: ExperimentStatus
created_at: datetime
updated_at: datetime
metadata: Dict[str, Any]
start_time: datetime
end_time: Optional[datetime] = None
attributes: Optional[Dict[str, Any]] = Field(default_factory=dict)
@json_schema_type
class Run(BaseModel):
id: str
experiment_id: str
status: str
started_at: datetime
ended_at: Optional[datetime]
metadata: Dict[str, Any]
class Trace(BaseModel):
trace_id: str
root_span_id: str
start_time: datetime
end_time: Optional[datetime] = None
@json_schema_type
class Metric(BaseModel):
name: str
value: Union[float, int, str, bool]
timestamp: datetime
run_id: str
@json_schema_type
class Log(BaseModel):
message: str
level: str
timestamp: datetime
additional_info: Dict[str, Any]
@json_schema_type
class ArtifactType(Enum):
MODEL = "model"
DATASET = "dataset"
CHECKPOINT = "checkpoint"
PLOT = "plot"
class EventType(Enum):
UNSTRUCTURED_LOG = "unstructured_log"
STRUCTURED_LOG = "structured_log"
METRIC = "metric"
CONFIG = "config"
CODE = "code"
OTHER = "other"
@json_schema_type
class Artifact(BaseModel):
id: str
class LogSeverity(Enum):
VERBOSE = "verbose"
DEBUG = "debug"
INFO = "info"
WARN = "warn"
ERROR = "error"
CRITICAL = "critical"
class EventCommon(BaseModel):
trace_id: str
span_id: str
timestamp: datetime
attributes: Optional[Dict[str, Any]] = Field(default_factory=dict)
@json_schema_type
class UnstructuredLogEvent(EventCommon):
type: Literal[EventType.UNSTRUCTURED_LOG.value] = EventType.UNSTRUCTURED_LOG.value
message: str
severity: LogSeverity
@json_schema_type
class MetricEvent(EventCommon):
type: Literal[EventType.METRIC.value] = EventType.METRIC.value
metric: str # this would be an enum
value: Union[int, float]
unit: str
@json_schema_type
class StructuredLogType(Enum):
SPAN_START = "span_start"
SPAN_END = "span_end"
@json_schema_type
class SpanStartPayload(BaseModel):
type: Literal[StructuredLogType.SPAN_START.value] = (
StructuredLogType.SPAN_START.value
)
name: str
type: ArtifactType
size: int
created_at: datetime
metadata: Dict[str, Any]
parent_span_id: Optional[str] = None
@json_schema_type
class CreateExperimentRequest(BaseModel):
name: str
metadata: Optional[Dict[str, Any]] = None
class SpanEndPayload(BaseModel):
type: Literal[StructuredLogType.SPAN_END.value] = StructuredLogType.SPAN_END.value
status: SpanStatus
StructuredLogPayload = Annotated[
Union[
SpanStartPayload,
SpanEndPayload,
],
Field(discriminator="type"),
]
@json_schema_type
class UpdateExperimentRequest(BaseModel):
experiment_id: str
status: Optional[ExperimentStatus] = None
metadata: Optional[Dict[str, Any]] = None
class StructuredLogEvent(EventCommon):
type: Literal[EventType.STRUCTURED_LOG.value] = EventType.STRUCTURED_LOG.value
payload: StructuredLogPayload
@json_schema_type
class CreateRunRequest(BaseModel):
experiment_id: str
metadata: Optional[Dict[str, Any]] = None
@json_schema_type
class UpdateRunRequest(BaseModel):
run_id: str
status: Optional[str] = None
ended_at: Optional[datetime] = None
metadata: Optional[Dict[str, Any]] = None
@json_schema_type
class LogMetricsRequest(BaseModel):
run_id: str
metrics: List[Metric]
@json_schema_type
class LogMessagesRequest(BaseModel):
logs: List[Log]
run_id: Optional[str] = None
@json_schema_type
class UploadArtifactRequest(BaseModel):
experiment_id: str
name: str
artifact_type: str
content: bytes
metadata: Optional[Dict[str, Any]] = None
@json_schema_type
class LogSearchRequest(BaseModel):
query: str
filters: Optional[Dict[str, Any]] = None
Event = Annotated[
Union[
UnstructuredLogEvent,
MetricEvent,
StructuredLogEvent,
],
Field(discriminator="type"),
]
class Telemetry(Protocol):
@webmethod(route="/experiments/create")
def create_experiment(self, request: CreateExperimentRequest) -> Experiment: ...
@webmethod(route="/telemetry/log_event")
async def log_event(self, event: Event): ...
@webmethod(route="/experiments/list")
def list_experiments(self) -> List[Experiment]: ...
@webmethod(route="/experiments/get")
def get_experiment(self, experiment_id: str) -> Experiment: ...
@webmethod(route="/experiments/update")
def update_experiment(self, request: UpdateExperimentRequest) -> Experiment: ...
@webmethod(route="/experiments/create_run")
def create_run(self, request: CreateRunRequest) -> Run: ...
@webmethod(route="/runs/update")
def update_run(self, request: UpdateRunRequest) -> Run: ...
@webmethod(route="/runs/log_metrics")
def log_metrics(self, request: LogMetricsRequest) -> None: ...
@webmethod(route="/runs/metrics", method="GET")
def get_metrics(self, run_id: str) -> List[Metric]: ...
@webmethod(route="/logging/log_messages")
def log_messages(self, request: LogMessagesRequest) -> None: ...
@webmethod(route="/logging/get_logs")
def get_logs(self, request: LogSearchRequest) -> List[Log]: ...
@webmethod(route="/experiments/artifacts/upload")
def upload_artifact(self, request: UploadArtifactRequest) -> Artifact: ...
@webmethod(route="/experiments/artifacts/get")
def list_artifacts(self, experiment_id: str) -> List[Artifact]: ...
@webmethod(route="/artifacts/get")
def get_artifact(self, artifact_id: str) -> Artifact: ...
@webmethod(route="/telemetry/get_trace", method="GET")
async def get_trace(self, trace_id: str) -> Trace: ...