From ebfa8ad4fbdcdee4c452667f92bb723d8e151324 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 28 Jan 2025 12:27:21 -0800 Subject: [PATCH] Update OpenAPI generator to add param and field documentation --- docs/openapi_generator/generate.py | 13 +- docs/openapi_generator/pyopenapi/generator.py | 34 +- .../openapi_generator/strong_typing/schema.py | 1 + docs/resources/llama-stack-spec.html | 392 ++++++++++-------- docs/resources/llama-stack-spec.yaml | 368 +++++++++------- .../apis/batch_inference/batch_inference.py | 33 +- llama_stack/apis/inference/inference.py | 81 ++-- 7 files changed, 525 insertions(+), 397 deletions(-) diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py index 1a59369cb..48109e5d8 100644 --- a/docs/openapi_generator/generate.py +++ b/docs/openapi_generator/generate.py @@ -36,6 +36,16 @@ from .pyopenapi.specification import Info, Server # noqa: E402 from .pyopenapi.utility import Specification # noqa: E402 +def str_presenter(dumper, data): + if data.startswith(f"/{LLAMA_STACK_API_VERSION}") or data.startswith( + "#/components/schemas/" + ): + style = None + else: + style = ">" if "\n" in data or len(data) > 40 else None + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style) + + def main(output_dir: str): output_dir = Path(output_dir) if not output_dir.exists(): @@ -69,7 +79,8 @@ def main(output_dir: str): y.sequence_dash_offset = 2 y.width = 80 y.allow_unicode = True - y.explicit_start = True + y.representer.add_representer(str, str_presenter) + y.dump( spec.get_json(), fp, diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py index 317b895b5..d8e0d81ed 100644 --- a/docs/openapi_generator/pyopenapi/generator.py +++ b/docs/openapi_generator/pyopenapi/generator.py @@ -8,6 +8,7 @@ import collections import hashlib import ipaddress import typing +from dataclasses import field, make_dataclass from typing import Any, Dict, Set, Union 
from ..strong_typing.core import JsonType @@ -276,6 +277,20 @@ class StatusResponse: examples: List[Any] = dataclasses.field(default_factory=list) +def create_docstring_for_request( + request_name: str, fields: List[Tuple[str, type, Any]], doc_params: Dict[str, str] +) -> str: + """Creates a ReST-style docstring for a dynamically generated request dataclass.""" + lines = ["\n"] # Short description + + # Add parameter documentation in ReST format + for name, type_ in fields: + desc = doc_params.get(name, "") + lines.append(f":param {name}: {desc}") + + return "\n".join(lines) + + class ResponseBuilder: content_builder: ContentBuilder @@ -493,11 +508,24 @@ class Generator: first = next(iter(op.request_params)) request_name, request_type = first - from dataclasses import make_dataclass - op_name = "".join(word.capitalize() for word in op.name.split("_")) request_name = f"{op_name}Request" - request_type = make_dataclass(request_name, op.request_params) + fields = [ + ( + name, + type_, + ) + for name, type_ in op.request_params + ] + request_type = make_dataclass( + request_name, + fields, + namespace={ + "__doc__": create_docstring_for_request( + request_name, fields, doc_params + ) + }, + ) requestBody = RequestBody( content={ diff --git a/docs/openapi_generator/strong_typing/schema.py b/docs/openapi_generator/strong_typing/schema.py index 826efdb4a..f4393041f 100644 --- a/docs/openapi_generator/strong_typing/schema.py +++ b/docs/openapi_generator/strong_typing/schema.py @@ -531,6 +531,7 @@ class JsonSchemaGenerator: # add property docstring if available property_doc = property_docstrings.get(property_name) if property_doc: + # print(output_name, property_doc) property_def.pop("title", None) property_def["description"] = property_doc diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index 5998963d2..b720bef21 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -190,7 +190,7 @@ "post": 
{ "responses": { "200": { - "description": "Chat completion response. **OR** SSE-stream of these events.", + "description": "If stream=False, returns a ChatCompletionResponse with the full completion. If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk", "content": { "text/event-stream": { "schema": { @@ -210,6 +210,7 @@ "tags": [ "Inference" ], + "summary": "Generate a chat completion for the given messages using the specified model.", "parameters": [], "requestBody": { "content": { @@ -227,7 +228,7 @@ "post": { "responses": { "200": { - "description": "Completion response. **OR** streamed completion response.", + "description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk", "content": { "text/event-stream": { "schema": { @@ -247,6 +248,7 @@ "tags": [ "Inference" ], + "summary": "Generate a completion for the given content using the specified model.", "parameters": [], "requestBody": { "content": { @@ -485,7 +487,7 @@ "post": { "responses": { "200": { - "description": "OK", + "description": "An array of embeddings, one for each content. 
Each embedding is a list of floats.", "content": { "application/json": { "schema": { @@ -498,6 +500,7 @@ "tags": [ "Inference" ], + "summary": "Generate embeddings for content pieces using the specified model.", "parameters": [], "requestBody": { "content": { @@ -2372,6 +2375,46 @@ "tool_calls" ] }, + "GrammarResponseFormat": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "grammar", + "default": "grammar" + }, + "bnf": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "bnf" + ] + }, "GreedySamplingStrategy": { "type": "object", "properties": { @@ -2447,6 +2490,46 @@ } } }, + "JsonSchemaResponseFormat": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "json_schema", + "default": "json_schema" + }, + "json_schema": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "type", + "json_schema" + ] + }, "Message": { "oneOf": [ { @@ -2472,6 +2555,23 @@ } } }, + "ResponseFormat": { + "oneOf": [ + { + "$ref": "#/components/schemas/JsonSchemaResponseFormat" + }, + { + "$ref": "#/components/schemas/GrammarResponseFormat" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "json_schema": "#/components/schemas/JsonSchemaResponseFormat", + "grammar": "#/components/schemas/GrammarResponseFormat" + } + } + }, "SamplingParams": { "type": "object", "properties": { @@ -2865,6 +2965,9 @@ "tool_prompt_format": { "$ref": "#/components/schemas/ToolPromptFormat" }, + "response_format": { + "$ref": 
"#/components/schemas/ResponseFormat" + }, "logprobs": { "type": "object", "properties": { @@ -2885,16 +2988,49 @@ "BatchChatCompletionResponse": { "type": "object", "properties": { - "completion_message_batch": { + "batch": { "type": "array", "items": { - "$ref": "#/components/schemas/CompletionMessage" + "$ref": "#/components/schemas/ChatCompletionResponse" } } }, "additionalProperties": false, "required": [ - "completion_message_batch" + "batch" + ] + }, + "ChatCompletionResponse": { + "type": "object", + "properties": { + "completion_message": { + "$ref": "#/components/schemas/CompletionMessage" + }, + "logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/TokenLogProbs" + } + } + }, + "additionalProperties": false, + "required": [ + "completion_message" + ] + }, + "TokenLogProbs": { + "type": "object", + "properties": { + "logprobs_by_token": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "additionalProperties": false, + "required": [ + "logprobs_by_token" ] }, "BatchCompletionRequest": { @@ -2912,6 +3048,9 @@ "sampling_params": { "$ref": "#/components/schemas/SamplingParams" }, + "response_format": { + "$ref": "#/components/schemas/ResponseFormat" + }, "logprobs": { "type": "object", "properties": { @@ -2932,18 +3071,41 @@ "BatchCompletionResponse": { "type": "object", "properties": { - "completion_message_batch": { + "batch": { "type": "array", "items": { - "$ref": "#/components/schemas/CompletionMessage" + "$ref": "#/components/schemas/CompletionResponse" } } }, "additionalProperties": false, "required": [ - "completion_message_batch" + "batch" ] }, + "CompletionResponse": { + "type": "object", + "properties": { + "content": { + "type": "string" + }, + "stop_reason": { + "$ref": "#/components/schemas/StopReason" + }, + "logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/TokenLogProbs" + } + } + }, + "additionalProperties": false, + "required": [ + "content", + 
"stop_reason" + ], + "title": "Completion response." + }, "CancelTrainingJobRequest": { "type": "object", "properties": { @@ -2956,135 +3118,46 @@ "job_uuid" ] }, - "GrammarResponseFormat": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "grammar", - "default": "grammar" - }, - "bnf": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "type", - "bnf" - ] - }, - "JsonSchemaResponseFormat": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "json_schema", - "default": "json_schema" - }, - "json_schema": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - } - }, - "additionalProperties": false, - "required": [ - "type", - "json_schema" - ] - }, - "ResponseFormat": { - "oneOf": [ - { - "$ref": "#/components/schemas/JsonSchemaResponseFormat" - }, - { - "$ref": "#/components/schemas/GrammarResponseFormat" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "json_schema": "#/components/schemas/JsonSchemaResponseFormat", - "grammar": "#/components/schemas/GrammarResponseFormat" - } - } - }, "ChatCompletionRequest": { "type": "object", "properties": { "model_id": { - "type": "string" + "type": "string", + "description": "The identifier of the model to use" }, "messages": { "type": "array", "items": { "$ref": "#/components/schemas/Message" - } + }, + "description": "List of messages in the conversation" }, "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "$ref": "#/components/schemas/SamplingParams", + "description": "Parameters to control the 
sampling strategy" }, "tools": { "type": "array", "items": { "$ref": "#/components/schemas/ToolDefinition" - } + }, + "description": "(Optional) List of tool definitions available to the model" }, "tool_choice": { - "$ref": "#/components/schemas/ToolChoice" + "$ref": "#/components/schemas/ToolChoice", + "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto." }, "tool_prompt_format": { - "$ref": "#/components/schemas/ToolPromptFormat" + "$ref": "#/components/schemas/ToolPromptFormat", + "description": "(Optional) Specifies how tool definitions are formatted when presenting to the model" }, "response_format": { - "$ref": "#/components/schemas/ResponseFormat" + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding" }, "stream": { - "type": "boolean" + "type": "boolean", + "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." }, "logprobs": { "type": "object", @@ -3094,7 +3167,8 @@ "default": 0 } }, - "additionalProperties": false + "additionalProperties": false, + "description": "(Optional) If specified, log probabilities for each token position will be returned." } }, "additionalProperties": false, @@ -3103,25 +3177,6 @@ "messages" ] }, - "ChatCompletionResponse": { - "type": "object", - "properties": { - "completion_message": { - "$ref": "#/components/schemas/CompletionMessage" - }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - } - } - }, - "additionalProperties": false, - "required": [ - "completion_message" - ], - "title": "Chat completion response." - }, "ChatCompletionResponseEvent": { "type": "object", "properties": { @@ -3166,8 +3221,7 @@ "additionalProperties": false, "required": [ "event" - ], - "title": "SSE-stream of these events." 
+ ] }, "ContentDelta": { "oneOf": [ @@ -3227,21 +3281,6 @@ "text" ] }, - "TokenLogProbs": { - "type": "object", - "properties": { - "logprobs_by_token": { - "type": "object", - "additionalProperties": { - "type": "number" - } - } - }, - "additionalProperties": false, - "required": [ - "logprobs_by_token" - ] - }, "ToolCallDelta": { "type": "object", "properties": { @@ -3284,19 +3323,24 @@ "type": "object", "properties": { "model_id": { - "type": "string" + "type": "string", + "description": "The identifier of the model to use" }, "content": { - "$ref": "#/components/schemas/InterleavedContent" + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content to generate a completion for" }, "sampling_params": { - "$ref": "#/components/schemas/SamplingParams" + "$ref": "#/components/schemas/SamplingParams", + "description": "(Optional) Parameters to control the sampling strategy" }, "response_format": { - "$ref": "#/components/schemas/ResponseFormat" + "$ref": "#/components/schemas/ResponseFormat", + "description": "(Optional) Grammar specification for guided (structured) decoding" }, "stream": { - "type": "boolean" + "type": "boolean", + "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False." }, "logprobs": { "type": "object", @@ -3306,7 +3350,8 @@ "default": 0 } }, - "additionalProperties": false + "additionalProperties": false, + "description": "(Optional) If specified, log probabilities for each token position will be returned." } }, "additionalProperties": false, @@ -3315,29 +3360,6 @@ "content" ] }, - "CompletionResponse": { - "type": "object", - "properties": { - "content": { - "type": "string" - }, - "stop_reason": { - "$ref": "#/components/schemas/StopReason" - }, - "logprobs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/TokenLogProbs" - } - } - }, - "additionalProperties": false, - "required": [ - "content", - "stop_reason" - ], - "title": "Completion response." 
- }, "CompletionResponseStreamChunk": { "type": "object", "properties": { @@ -4241,13 +4263,15 @@ "type": "object", "properties": { "model_id": { - "type": "string" + "type": "string", + "description": "The identifier of the model to use" }, "contents": { "type": "array", "items": { "$ref": "#/components/schemas/InterleavedContent" - } + }, + "description": "List of contents to generate embeddings for. Note that content can be multimodal." } }, "additionalProperties": false, @@ -7863,7 +7887,7 @@ }, { "name": "ChatCompletionResponse", - "description": "Chat completion response." + "description": "" }, { "name": "ChatCompletionResponseEvent", @@ -7875,7 +7899,7 @@ }, { "name": "ChatCompletionResponseStreamChunk", - "description": "SSE-stream of these events." + "description": "" }, { "name": "Checkpoint", diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index 1d7c4f113..353d99d00 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -1,11 +1,12 @@ ---- openapi: 3.1.0 info: title: Llama Stack Specification version: v1 - description: "This is the specification of the Llama Stack that provides\n \ - \ a set of endpoints and their corresponding interfaces that are tailored - to\n best leverage Llama Models." + description: >- + This is the specification of the Llama Stack that provides + a set of endpoints and their corresponding interfaces that are + tailored to + best leverage Llama Models. servers: - url: http://any-hosted-llama-stack.com paths: @@ -108,7 +109,9 @@ paths: post: responses: '200': - description: Chat completion response. **OR** SSE-stream of these events. + description: >- + If stream=False, returns a ChatCompletionResponse with the full completion. 
+ If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk content: text/event-stream: schema: @@ -117,6 +120,8 @@ paths: - $ref: '#/components/schemas/ChatCompletionResponseStreamChunk' tags: - Inference + summary: >- + Generate a chat completion for the given messages using the specified model. parameters: [] requestBody: content: @@ -128,7 +133,9 @@ paths: post: responses: '200': - description: Completion response. **OR** streamed completion response. + description: >- + If stream=False, returns a CompletionResponse with the full completion. + If stream=True, returns an SSE event stream of CompletionResponseStreamChunk content: text/event-stream: schema: @@ -137,6 +144,8 @@ paths: - $ref: '#/components/schemas/CompletionResponseStreamChunk' tags: - Inference + summary: >- + Generate a completion for the given content using the specified model. parameters: [] requestBody: content: @@ -189,8 +198,9 @@ paths: post: responses: '200': - description: A single turn in an interaction with an Agentic System. **OR** - streamed agent turn completion response. + description: >- + A single turn in an interaction with an Agentic System. **OR** streamed + agent turn completion response. content: text/event-stream: schema: @@ -279,13 +289,17 @@ paths: post: responses: '200': - description: OK + description: >- + An array of embeddings, one for each content. Each embedding is a list + of floats. content: application/json: schema: $ref: '#/components/schemas/EmbeddingsResponse' tags: - Inference + summary: >- + Generate embeddings for content pieces using the specified model. 
parameters: [] requestBody: content: @@ -709,7 +723,8 @@ paths: description: OK tags: - ToolRuntime - summary: Index documents so they can be used by the RAG system + summary: >- + Index documents so they can be used by the RAG system parameters: [] requestBody: content: @@ -1109,7 +1124,8 @@ paths: $ref: '#/components/schemas/RAGQueryResult' tags: - ToolRuntime - summary: Query the RAG system for context; typically invoked by the agent + summary: >- + Query the RAG system for context; typically invoked by the agent parameters: [] requestBody: content: @@ -1341,7 +1357,8 @@ paths: tags: - Inspect parameters: [] -jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema +jsonSchemaDialect: >- + https://json-schema.org/draft/2020-12/schema components: schemas: AppendRowsRequest: @@ -1393,6 +1410,27 @@ components: - content - stop_reason - tool_calls + GrammarResponseFormat: + type: object + properties: + type: + type: string + const: grammar + default: grammar + bnf: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - type + - bnf GreedySamplingStrategy: type: object properties: @@ -1439,6 +1477,27 @@ components: mapping: image: '#/components/schemas/ImageContentItem' text: '#/components/schemas/TextContentItem' + JsonSchemaResponseFormat: + type: object + properties: + type: + type: string + const: json_schema + default: json_schema + json_schema: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - type + - json_schema Message: oneOf: - $ref: '#/components/schemas/UserMessage' @@ -1452,6 +1511,15 @@ components: system: '#/components/schemas/SystemMessage' tool: '#/components/schemas/ToolResponseMessage' assistant: '#/components/schemas/CompletionMessage' + ResponseFormat: + 
oneOf: + - $ref: '#/components/schemas/JsonSchemaResponseFormat' + - $ref: '#/components/schemas/GrammarResponseFormat' + discriminator: + propertyName: type + mapping: + json_schema: '#/components/schemas/JsonSchemaResponseFormat' + grammar: '#/components/schemas/GrammarResponseFormat' SamplingParams: type: object properties: @@ -1594,16 +1662,28 @@ components: - json - function_tag - python_list - title: This Enum refers to the prompt format for calling custom / zero shot - tools - description: "`json` --\n Refers to the json format for calling tools.\n\ - \ The json format takes the form like\n {\n \"type\": \"function\"\ - ,\n \"function\" : {\n \"name\": \"function_name\",\n \ - \ \"description\": \"function_description\",\n \"parameters\"\ - : {...}\n }\n }\n\n`function_tag` --\n This is an example of - how you could define\n your own user defined format for making tool calls.\n\ - \ The function_tag format looks like this,\n (parameters)\n - \nThe detailed prompts for each of these formats are added to llama cli" + title: >- + This Enum refers to the prompt format for calling custom / zero shot tools + description: >- + `json` -- + Refers to the json format for calling tools. + The json format takes the form like + { + "type": "function", + "function" : { + "name": "function_name", + "description": "function_description", + "parameters": {...} + } + } + + `function_tag` -- + This is an example of how you could define + your own user defined format for making tool calls. 
+ The function_tag format looks like this, + (parameters) + + The detailed prompts for each of these formats are added to llama cli ToolResponseMessage: type: object properties: @@ -1697,6 +1777,8 @@ components: $ref: '#/components/schemas/ToolChoice' tool_prompt_format: $ref: '#/components/schemas/ToolPromptFormat' + response_format: + $ref: '#/components/schemas/ResponseFormat' logprobs: type: object properties: @@ -1711,13 +1793,35 @@ components: BatchChatCompletionResponse: type: object properties: - completion_message_batch: + batch: type: array items: - $ref: '#/components/schemas/CompletionMessage' + $ref: '#/components/schemas/ChatCompletionResponse' additionalProperties: false required: - - completion_message_batch + - batch + ChatCompletionResponse: + type: object + properties: + completion_message: + $ref: '#/components/schemas/CompletionMessage' + logprobs: + type: array + items: + $ref: '#/components/schemas/TokenLogProbs' + additionalProperties: false + required: + - completion_message + TokenLogProbs: + type: object + properties: + logprobs_by_token: + type: object + additionalProperties: + type: number + additionalProperties: false + required: + - logprobs_by_token BatchCompletionRequest: type: object properties: @@ -1729,6 +1833,8 @@ components: $ref: '#/components/schemas/InterleavedContent' sampling_params: $ref: '#/components/schemas/SamplingParams' + response_format: + $ref: '#/components/schemas/ResponseFormat' logprobs: type: object properties: @@ -1743,13 +1849,29 @@ components: BatchCompletionResponse: type: object properties: - completion_message_batch: + batch: type: array items: - $ref: '#/components/schemas/CompletionMessage' + $ref: '#/components/schemas/CompletionResponse' additionalProperties: false required: - - completion_message_batch + - batch + CompletionResponse: + type: object + properties: + content: + type: string + stop_reason: + $ref: '#/components/schemas/StopReason' + logprobs: + type: array + items: + $ref: 
'#/components/schemas/TokenLogProbs' + additionalProperties: false + required: + - content + - stop_reason + title: Completion response. CancelTrainingJobRequest: type: object properties: @@ -1758,80 +1880,45 @@ components: additionalProperties: false required: - job_uuid - GrammarResponseFormat: - type: object - properties: - type: - type: string - const: grammar - default: grammar - bnf: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - type - - bnf - JsonSchemaResponseFormat: - type: object - properties: - type: - type: string - const: json_schema - default: json_schema - json_schema: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - additionalProperties: false - required: - - type - - json_schema - ResponseFormat: - oneOf: - - $ref: '#/components/schemas/JsonSchemaResponseFormat' - - $ref: '#/components/schemas/GrammarResponseFormat' - discriminator: - propertyName: type - mapping: - json_schema: '#/components/schemas/JsonSchemaResponseFormat' - grammar: '#/components/schemas/GrammarResponseFormat' ChatCompletionRequest: type: object properties: model_id: type: string + description: The identifier of the model to use messages: type: array items: $ref: '#/components/schemas/Message' + description: List of messages in the conversation sampling_params: $ref: '#/components/schemas/SamplingParams' + description: >- + Parameters to control the sampling strategy tools: type: array items: $ref: '#/components/schemas/ToolDefinition' + description: >- + (Optional) List of tool definitions available to the model tool_choice: $ref: '#/components/schemas/ToolChoice' + description: >- + (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. 
tool_prompt_format: $ref: '#/components/schemas/ToolPromptFormat' + description: >- + (Optional) Specifies how tool definitions are formatted when presenting + to the model response_format: $ref: '#/components/schemas/ResponseFormat' + description: >- + (Optional) Grammar specification for guided (structured) decoding stream: type: boolean + description: >- + (Optional) If True, generate an SSE event stream of the response. Defaults + to False. logprobs: type: object properties: @@ -1839,23 +1926,13 @@ components: type: integer default: 0 additionalProperties: false + description: >- + (Optional) If specified, log probabilities for each token position will + be returned. additionalProperties: false required: - model_id - messages - ChatCompletionResponse: - type: object - properties: - completion_message: - $ref: '#/components/schemas/CompletionMessage' - logprobs: - type: array - items: - $ref: '#/components/schemas/TokenLogProbs' - additionalProperties: false - required: - - completion_message - title: Chat completion response. ChatCompletionResponseEvent: type: object properties: @@ -1888,7 +1965,6 @@ components: additionalProperties: false required: - event - title: SSE-stream of these events. 
ContentDelta: oneOf: - $ref: '#/components/schemas/TextDelta' @@ -1927,16 +2003,6 @@ components: required: - type - text - TokenLogProbs: - type: object - properties: - logprobs_by_token: - type: object - additionalProperties: - type: number - additionalProperties: false - required: - - logprobs_by_token ToolCallDelta: type: object properties: @@ -1967,14 +2033,23 @@ components: properties: model_id: type: string + description: The identifier of the model to use content: $ref: '#/components/schemas/InterleavedContent' + description: The content to generate a completion for sampling_params: $ref: '#/components/schemas/SamplingParams' + description: >- + (Optional) Parameters to control the sampling strategy response_format: $ref: '#/components/schemas/ResponseFormat' + description: >- + (Optional) Grammar specification for guided (structured) decoding stream: type: boolean + description: >- + (Optional) If True, generate an SSE event stream of the response. Defaults + to False. logprobs: type: object properties: @@ -1982,26 +2057,13 @@ components: type: integer default: 0 additionalProperties: false + description: >- + (Optional) If specified, log probabilities for each token position will + be returned. additionalProperties: false required: - model_id - content - CompletionResponse: - type: object - properties: - content: - type: string - stop_reason: - $ref: '#/components/schemas/StopReason' - logprobs: - type: array - items: - $ref: '#/components/schemas/TokenLogProbs' - additionalProperties: false - required: - - content - - stop_reason - title: Completion response. CompletionResponseStreamChunk: type: object properties: @@ -2558,7 +2620,8 @@ components: - output_message - output_attachments - started_at - title: A single turn in an interaction with an Agentic System. + title: >- + A single turn in an interaction with an Agentic System. 
ViolationLevel: type: string enum: @@ -2570,10 +2633,14 @@ components: properties: model_id: type: string + description: The identifier of the model to use contents: type: array items: $ref: '#/components/schemas/InterleavedContent' + description: >- + List of contents to generate embeddings for. Note that content can be + multimodal. additionalProperties: false required: - model_id @@ -2845,7 +2912,8 @@ components: - session_name - turns - started_at - title: A single session of an interaction with an Agentic System. + title: >- + A single session of an interaction with an Agentic System. AgentStepResponse: type: object properties: @@ -3194,7 +3262,8 @@ components: - provider_resource_id - provider_id - type - title: A safety shield resource that can be used to check content + title: >- + A safety shield resource that can be used to check content Span: type: object properties: @@ -4684,8 +4753,9 @@ components: additionalProperties: false required: - synthetic_data - title: Response from the synthetic data generation. Batch of (prompt, response, - score) tuples that pass the threshold. + title: >- + Response from the synthetic data generation. Batch of (prompt, response, score) + tuples that pass the threshold. VersionInfo: type: object properties: @@ -4763,13 +4833,13 @@ tags: - name: ChatCompletionRequest description: '' - name: ChatCompletionResponse - description: Chat completion response. + description: '' - name: ChatCompletionResponseEvent description: Chat completion response event. - name: ChatCompletionResponseEventType description: '' - name: ChatCompletionResponseStreamChunk - description: SSE-stream of these events. + description: '' - name: Checkpoint description: Checkpoint created during training runs - name: CompletionInputType @@ -4998,9 +5068,11 @@ tags: - name: ScoringResult description: '' - name: Session - description: A single session of an interaction with an Agentic System. 
+ description: >- + A single session of an interaction with an Agentic System. - name: Shield - description: A safety shield resource that can be used to check content + description: >- + A safety shield resource that can be used to check content - name: ShieldCallStep description: '' - name: Shields @@ -5028,8 +5100,9 @@ tags: description: '' - name: SyntheticDataGeneration (Coming Soon) - name: SyntheticDataGenerationResponse - description: Response from the synthetic data generation. Batch of (prompt, response, - score) tuples that pass the threshold. + description: >- + Response from the synthetic data generation. Batch of (prompt, response, score) + tuples that pass the threshold. - name: SystemMessage description: '' - name: Telemetry @@ -5067,15 +5140,29 @@ tags: - name: ToolParameter description: '' - name: ToolPromptFormat - description: "This Enum refers to the prompt format for calling custom / zero - shot tools\n\n`json` --\n Refers to the json format for calling tools.\n\ - \ The json format takes the form like\n {\n \"type\": \"function\"\ - ,\n \"function\" : {\n \"name\": \"function_name\",\n \ - \ \"description\": \"function_description\",\n \"parameters\"\ - : {...}\n }\n }\n\n`function_tag` --\n This is an example of how - you could define\n your own user defined format for making tool calls.\n\ - \ The function_tag format looks like this,\n (parameters)\n - \nThe detailed prompts for each of these formats are added to llama cli" + description: >- + This Enum refers to the prompt format for calling custom / zero shot tools + + + `json` -- + Refers to the json format for calling tools. + The json format takes the form like + { + "type": "function", + "function" : { + "name": "function_name", + "description": "function_description", + "parameters": {...} + } + } + + `function_tag` -- + This is an example of how you could define + your own user defined format for making tool calls. 
+ The function_tag format looks like this, + (parameters) + + The detailed prompts for each of these formats are added to llama cli - name: ToolResponse description: '' - name: ToolResponseMessage @@ -5090,7 +5177,8 @@ tags: - name: TrainingConfig description: '' - name: Turn - description: A single turn in an interaction with an Agentic System. + description: >- + A single turn in an interaction with an Agentic System. - name: URL description: '' - name: UnionType diff --git a/llama_stack/apis/batch_inference/batch_inference.py b/llama_stack/apis/batch_inference/batch_inference.py index ca5ba059f..413c81c5a 100644 --- a/llama_stack/apis/batch_inference/batch_inference.py +++ b/llama_stack/apis/batch_inference/batch_inference.py @@ -7,13 +7,15 @@ from typing import List, Optional, Protocol, runtime_checkable from llama_models.schema_utils import json_schema_type, webmethod -from pydantic import BaseModel, Field +from pydantic import BaseModel from llama_stack.apis.inference import ( - CompletionMessage, + ChatCompletionResponse, + CompletionResponse, InterleavedContent, LogProbConfig, Message, + ResponseFormat, SamplingParams, ToolChoice, ToolDefinition, @@ -21,35 +23,14 @@ from llama_stack.apis.inference import ( ) -@json_schema_type -class BatchCompletionRequest(BaseModel): - model: str - content_batch: List[InterleavedContent] - sampling_params: Optional[SamplingParams] = SamplingParams() - logprobs: Optional[LogProbConfig] = None - - @json_schema_type class BatchCompletionResponse(BaseModel): - completion_message_batch: List[CompletionMessage] - - -@json_schema_type -class BatchChatCompletionRequest(BaseModel): - model: str - messages_batch: List[List[Message]] - sampling_params: Optional[SamplingParams] = SamplingParams() - - # zero-shot tool definitions as input to the model - tools: Optional[List[ToolDefinition]] = Field(default_factory=list) - tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto) - tool_prompt_format: Optional[ToolPromptFormat] 
= Field(default=None) - logprobs: Optional[LogProbConfig] = None + batch: List[CompletionResponse] @json_schema_type class BatchChatCompletionResponse(BaseModel): - completion_message_batch: List[CompletionMessage] + batch: List[ChatCompletionResponse] @runtime_checkable @@ -60,6 +41,7 @@ class BatchInference(Protocol): model: str, content_batch: List[InterleavedContent], sampling_params: Optional[SamplingParams] = SamplingParams(), + response_format: Optional[ResponseFormat] = None, logprobs: Optional[LogProbConfig] = None, ) -> BatchCompletionResponse: ... @@ -73,5 +55,6 @@ class BatchInference(Protocol): tools: Optional[List[ToolDefinition]] = list, tool_choice: Optional[ToolChoice] = ToolChoice.auto, tool_prompt_format: Optional[ToolPromptFormat] = None, + response_format: Optional[ResponseFormat] = None, logprobs: Optional[LogProbConfig] = None, ) -> BatchChatCompletionResponse: ... diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index 871f1f633..36f385eb2 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -186,7 +186,6 @@ ResponseFormat = register_schema( ) -@json_schema_type class CompletionRequest(BaseModel): model: str content: InterleavedContent @@ -215,23 +214,6 @@ class CompletionResponseStreamChunk(BaseModel): logprobs: Optional[List[TokenLogProbs]] = None -@json_schema_type -class BatchCompletionRequest(BaseModel): - model: str - content_batch: List[InterleavedContent] - sampling_params: Optional[SamplingParams] = SamplingParams() - response_format: Optional[ResponseFormat] = None - logprobs: Optional[LogProbConfig] = None - - -@json_schema_type -class BatchCompletionResponse(BaseModel): - """Batch completion response.""" - - batch: List[CompletionResponse] - - -@json_schema_type class ChatCompletionRequest(BaseModel): model: str messages: List[Message] @@ -249,37 +231,15 @@ class ChatCompletionRequest(BaseModel): @json_schema_type class 
ChatCompletionResponseStreamChunk(BaseModel): - """SSE-stream of these events.""" - event: ChatCompletionResponseEvent @json_schema_type class ChatCompletionResponse(BaseModel): - """Chat completion response.""" - completion_message: CompletionMessage logprobs: Optional[List[TokenLogProbs]] = None -@json_schema_type -class BatchChatCompletionRequest(BaseModel): - model: str - messages_batch: List[List[Message]] - sampling_params: Optional[SamplingParams] = SamplingParams() - - # zero-shot tool definitions as input to the model - tools: Optional[List[ToolDefinition]] = Field(default_factory=list) - tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto) - tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None) - logprobs: Optional[LogProbConfig] = None - - -@json_schema_type -class BatchChatCompletionResponse(BaseModel): - batch: List[ChatCompletionResponse] - - @json_schema_type class EmbeddingsResponse(BaseModel): embeddings: List[List[float]] @@ -303,7 +263,19 @@ class Inference(Protocol): response_format: Optional[ResponseFormat] = None, stream: Optional[bool] = False, logprobs: Optional[LogProbConfig] = None, - ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]: ... + ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]: + """Generate a completion for the given content using the specified model. + + :param model_id: The identifier of the model to use + :param content: The content to generate a completion for + :param sampling_params: (Optional) Parameters to control the sampling strategy + :param response_format: (Optional) Grammar specification for guided (structured) decoding + :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False. + :param logprobs: (Optional) If specified, log probabilities for each token position will be returned. + :returns: If stream=False, returns a CompletionResponse with the full completion. 
+ If stream=True, returns an SSE event stream of CompletionResponseStreamChunk + """ + ... @webmethod(route="/inference/chat-completion", method="POST") async def chat_completion( @@ -311,7 +283,6 @@ class Inference(Protocol): model_id: str, messages: List[Message], sampling_params: Optional[SamplingParams] = SamplingParams(), - # zero-shot tool definitions as input to the model tools: Optional[List[ToolDefinition]] = None, tool_choice: Optional[ToolChoice] = ToolChoice.auto, tool_prompt_format: Optional[ToolPromptFormat] = None, @@ -320,11 +291,33 @@ class Inference(Protocol): logprobs: Optional[LogProbConfig] = None, ) -> Union[ ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk] - ]: ... + ]: + """Generate a chat completion for the given messages using the specified model. + + :param model_id: The identifier of the model to use + :param messages: List of messages in the conversation + :param sampling_params: Parameters to control the sampling strategy + :param tools: (Optional) List of tool definitions available to the model + :param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto. + :param tool_prompt_format: (Optional) Specifies how tool definitions are formatted when presenting to the model + :param response_format: (Optional) Grammar specification for guided (structured) decoding + :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False. + :param logprobs: (Optional) If specified, log probabilities for each token position will be returned. + :returns: If stream=False, returns a ChatCompletionResponse with the full completion. + If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk + """ + ... @webmethod(route="/inference/embeddings", method="POST") async def embeddings( self, model_id: str, contents: List[InterleavedContent], - ) -> EmbeddingsResponse: ... 
+    ) -> EmbeddingsResponse:
+        """Generate embeddings for content pieces using the specified model.
+
+        :param model_id: The identifier of the model to use
+        :param contents: List of contents to generate embeddings for. Note that content can be multimodal.
+        :returns: An array of embeddings, one for each content. Each embedding is a list of floats.
+        """
+        ...