# Simple bullet form for ease of reading and iteration.
# Use LLMs to translate this to an OpenAPI spec.

== Schema ==
Message:
  role: str
  text: str
  attachments: List[MediaAttachment]
  eot: bool
  tool_call: bool # if it's a tool call - builtin, custom, or ipython

  # for streaming
  is_complete: bool
  is_header_complete: bool
  metadata: json
MediaAttachment:
  attachment_type: MediaAttachmentType
  data_type: MediaAttachmentDataType
  data: str

MediaAttachmentType: # enum [image, video, audio, text (or file)]
MediaAttachmentDataType: # enum [raw_bytes, filepath, uri]
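# A minimal Python sketch of the message types above (illustration only; the
# use of dataclasses and a plain dict for the `json` metadata field are assumptions):

```python
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List


class MediaAttachmentType(Enum):
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
    TEXT = "text"  # or file


class MediaAttachmentDataType(Enum):
    RAW_BYTES = "raw_bytes"
    FILEPATH = "filepath"
    URI = "uri"


@dataclass
class MediaAttachment:
    attachment_type: MediaAttachmentType
    data_type: MediaAttachmentDataType
    data: str  # interpreted per data_type: raw bytes, a filepath, or a URI


@dataclass
class Message:
    role: str
    text: str
    attachments: List[MediaAttachment] = field(default_factory=list)
    eot: bool = True
    tool_call: bool = False  # builtin, custom, or ipython

    # for streaming
    is_complete: bool = True
    is_header_complete: bool = True
    metadata: Dict[str, Any] = field(default_factory=dict)  # stands in for `json`
```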
Completion:
  id: str
  message: Message
  tokens: List[int]
  logprobs: List[float]
  finish_reason: str # enum (stop, safety, max_length, etc.)

Options:
  logprobs: bool
  max_tokens: int
  temperature: float
  top_p: float
  # TODO: Get more options from metagen

TokenUsage:
  input_tokens: int
  output_tokens: int
  total_tokens: int
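# Continuing the sketch above: Completion, Options, and TokenUsage as dataclasses
# (the default values shown are assumptions, not part of the spec):

```python
from __future__ import annotations  # lets `Message` stay a forward reference

from dataclasses import dataclass
from typing import List


@dataclass
class Options:
    logprobs: bool = False
    max_tokens: int = 512  # assumed default
    temperature: float = 1.0
    top_p: float = 1.0


@dataclass
class TokenUsage:
    input_tokens: int
    output_tokens: int
    total_tokens: int


@dataclass
class Completion:
    id: str
    message: Message  # Message as defined in the previous sketch
    tokens: List[int]
    logprobs: List[float]
    finish_reason: str  # "stop", "safety", "max_length", ...
```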
== Callsite ==

callsite:
  chat_completion/
    request_type: post
    description: submit a chat completion request
    request:
      messages: List[Message]
      model: str
      options: Options
      n_completions: int
      # TODO: how to handle tooling control, if any?
      # Add `tools` and `tool_choice` --
      # e.g. "auto": use the model's guess
      # how to force use of a particular tool
      # how to disable builtin tools
      # tools: List[Tool]
      # tool_choice: Any
    response:
      id: str
      candidates: List[Completion] # a list to account for when n_completions > 1
      model_called: str # info on the model that produced this result
      usage: TokenUsage
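# A hedged usage sketch of what a call to this endpoint could look like; the
# base URL, model name, and JSON wire format are assumptions, not part of the spec:

```python
import requests

# Hypothetical base URL -- the spec does not pin one down.
BASE_URL = "http://localhost:5000"

payload = {
    "messages": [
        {
            "role": "user",
            "text": "Hello!",
            "attachments": [],
            "eot": True,
            "tool_call": False,
        }
    ],
    "model": "llama-3-8b",  # assumed model identifier
    "options": {"logprobs": False, "max_tokens": 128, "temperature": 0.7, "top_p": 0.9},
    "n_completions": 1,
}

resp = requests.post(f"{BASE_URL}/chat_completion", json=payload)
body = resp.json()

# Expected response shape per the spec above:
#   body["id"], body["candidates"][0]["message"]["text"],
#   body["model_called"], body["usage"]["total_tokens"]
print(body["candidates"][0]["message"]["text"])
```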
# TODO
# callsite:
#   chat_completion_stream/