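"""API surface for the Llama Stack: request/response dataclasses and Protocol
interfaces for inference and the agentic system. Run as a script, this module
writes the OpenAPI spec to openapi.yaml and openapi.html."""
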
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional, Protocol, Set, Union

import yaml

from model_types import (
    BuiltinTool,
    Content,
    InstructModel,
    Message,
    PretrainedModel,
    SamplingParams,
    StopReason,
    ToolCall,
    ToolDefinition,
    ToolResponse,
)

from agentic_system_types import (
    AgenticSystemTurn,
)

from pyopenapi import Info, Options, Server, Specification, webmethod
from strong_typing.schema import json_schema_type


@json_schema_type
@dataclass
class CompletionRequest:
    content: Content
    model: PretrainedModel
    sampling_params: SamplingParams = field(default_factory=SamplingParams)
    max_tokens: int = 0
    stream: bool = False
    logprobs: bool = False


@json_schema_type
@dataclass
class CompletionResponse:
    """Normal completion response."""

    content: Content
    stop_reason: Optional[StopReason] = None
    logprobs: Optional[Dict[str, Any]] = None


@json_schema_type
@dataclass
class CompletionResponseStreamChunk:
    """Streamed completion response. The actual response is a series of such chunks."""

    text_delta: str
    stop_reason: Optional[StopReason] = None
    logprobs: Optional[Dict[str, Any]] = None


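# Chat-style completion operates on Message objects rather than raw Content
# and can hand the model zero-shot tool definitions to call.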
@json_schema_type
@dataclass
class ChatCompletionRequest:
    message: Message
    model: InstructModel
    message_history: List[Message] = field(default_factory=list)
    sampling_params: SamplingParams = field(default_factory=SamplingParams)

    # zero-shot tool definitions as input to the model
    available_tools: List[Union[BuiltinTool, ToolDefinition]] = field(
        default_factory=list
    )

    max_tokens: int = 0
    stream: bool = False
    logprobs: bool = False


@json_schema_type
@dataclass
class ChatCompletionResponse:
    """Normal chat completion response."""

    content: Content

    # note: multiple tool calls can be generated in a single response
    tool_calls: List[ToolCall] = field(default_factory=list)

    stop_reason: Optional[StopReason] = None
    logprobs: Optional[Dict[str, Any]] = None


@json_schema_type
@dataclass
class ChatCompletionResponseStreamChunk:
    """Streamed chat completion response. The actual response is a series of such objects."""

    text_delta: str
    stop_reason: Optional[StopReason] = None
    tool_call: Optional[ToolCall] = None


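# Inference endpoints. Each method takes a single request dataclass; when the
# request sets stream=True, the response arrives as a series of stream chunks
# rather than one response object.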
class Inference(Protocol):

    def post_completion(
        self,
        request: CompletionRequest,
    ) -> Union[CompletionResponse, CompletionResponseStreamChunk]: ...

    def post_chat_completion(
        self,
        request: ChatCompletionRequest,
    ) -> Union[ChatCompletionResponse, ChatCompletionResponseStreamChunk]: ...


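# A minimal sketch of a conforming implementation (illustrative only, not part
# of the spec). `EchoInference` is hypothetical; it assumes `Message` exposes
# a `content: Content` field, which this file does not guarantee.
class EchoInference:
    def post_completion(
        self,
        request: CompletionRequest,
    ) -> Union[CompletionResponse, CompletionResponseStreamChunk]:
        # Echo the prompt back as the completion.
        return CompletionResponse(content=request.content)

    def post_chat_completion(
        self,
        request: ChatCompletionRequest,
    ) -> Union[ChatCompletionResponse, ChatCompletionResponseStreamChunk]:
        # Echo the incoming message back as the reply (assumes Message.content).
        return ChatCompletionResponse(content=request.message.content)

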
@dataclass
class AgenticSystemCreateRequest:
    instructions: str
    model: InstructModel

    # zero-shot tool definitions as input to the model
    available_tools: List[Union[BuiltinTool, ToolDefinition]] = field(
        default_factory=list
    )

    executable_tools: Set[str] = field(default_factory=set)


@json_schema_type
@dataclass
class AgenticSystemCreateResponse:
    agent_id: str


@json_schema_type
@dataclass
class AgenticSystemExecuteRequest:
    agent_id: str
    messages: List[Message]
    turn_history: List[AgenticSystemTurn] = field(default_factory=list)
    stream: bool = False


@json_schema_type
@dataclass
class AgenticSystemExecuteResponse:
    """Non-streaming response from the agentic system."""

    turn: AgenticSystemTurn


@json_schema_type
@dataclass
class AgenticSystemExecuteResponseStreamChunk:
    """Streamed agent execution response."""

    # TODO: make things streamable
    turn: AgenticSystemTurn

    stop_reason: Optional[StopReason] = None


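# Agentic system endpoints: create an agent from instructions and tool
# definitions, then execute message turns against it.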
class AgenticSystem(Protocol):

    @webmethod(route="/agentic_system/create")
    def create_agentic_system(
        self,
        request: AgenticSystemCreateRequest,
    ) -> AgenticSystemCreateResponse: ...

    @webmethod(route="/agentic_system/execute")
    def create_agentic_system_execute(
        self,
        request: AgenticSystemExecuteRequest,
    ) -> Union[
        AgenticSystemExecuteResponse, AgenticSystemExecuteResponseStreamChunk
    ]: ...


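# One type that unifies both protocols, so pyopenapi can derive a single
# combined specification from it below.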
class LlamaStackEndpoints(Inference, AgenticSystem): ...


if __name__ == "__main__":
    print("Converting the spec to YAML (openapi.yaml) and HTML (openapi.html)")
    spec = Specification(
        LlamaStackEndpoints,
        Options(
            server=Server(url="http://llama.meta.com"),
            info=Info(
                title="Llama Stack specification",
                version="0.1",
                description="This is the llama stack",
            ),
        ),
    )

    with open("openapi.yaml", "w", encoding="utf-8") as fp:
        yaml.dump(spec.get_json(), fp, allow_unicode=True)

    with open("openapi.html", "w") as fp:
        spec.write_html(fp, pretty_print=True)