From aaf5036235ab03bef20dcdeb432fc82f0e4a288e Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 10 Oct 2025 06:22:59 -0700 Subject: [PATCH] feat(responses): add usage types to inference and responses APIs (#3764) ## Summary Adds OpenAI-compatible usage tracking types to enable reporting token consumption for both streaming and non-streaming responses. ## Type Definitions **Chat Completion Usage** (inference API): ```python class OpenAIChatCompletionUsage(BaseModel): prompt_tokens: int completion_tokens: int total_tokens: int prompt_tokens_details: OpenAIChatCompletionUsagePromptTokensDetails | None completion_tokens_details: OpenAIChatCompletionUsageCompletionTokensDetails | None ``` **Response Usage** (responses API): ```python class OpenAIResponseUsage(BaseModel): input_tokens: int output_tokens: int total_tokens: int input_tokens_details: OpenAIResponseUsageInputTokensDetails | None output_tokens_details: OpenAIResponseUsageOutputTokensDetails | None ``` This matches OpenAI's usage reporting format and enables PR #3766 to implement usage tracking in streaming responses. Co-authored-by: Claude --- docs/static/deprecated-llama-stack-spec.html | 120 +++++++++++++++++++ docs/static/deprecated-llama-stack-spec.yaml | 103 ++++++++++++++++ docs/static/llama-stack-spec.html | 120 +++++++++++++++++++ docs/static/llama-stack-spec.yaml | 103 ++++++++++++++++ docs/static/stainless-llama-stack-spec.html | 120 +++++++++++++++++++ docs/static/stainless-llama-stack-spec.yaml | 103 ++++++++++++++++ llama_stack/apis/agents/openai_responses.py | 38 ++++++ llama_stack/apis/inference/inference.py | 40 +++++++ 8 files changed, 747 insertions(+) diff --git a/docs/static/deprecated-llama-stack-spec.html b/docs/static/deprecated-llama-stack-spec.html index 04a3dca9b..463837141 100644 --- a/docs/static/deprecated-llama-stack-spec.html +++ b/docs/static/deprecated-llama-stack-spec.html @@ -6781,6 +6781,10 @@ "type": "string", "description": "The model that was used to generate the chat completion" }, + "usage": { + "$ref": "#/components/schemas/OpenAIChatCompletionUsage", + "description": "Token usage information for the completion" + }, "input_messages": { "type": "array", "items": { @@ -6983,6 +6987,55 @@ "title": "OpenAIChatCompletionToolCallFunction", "description": "Function call details for OpenAI-compatible tool calls." }, + "OpenAIChatCompletionUsage": { + "type": "object", + "properties": { + "prompt_tokens": { + "type": "integer", + "description": "Number of tokens in the prompt" + }, + "completion_tokens": { + "type": "integer", + "description": "Number of tokens in the completion" + }, + "total_tokens": { + "type": "integer", + "description": "Total tokens used (prompt + completion)" + }, + "prompt_tokens_details": { + "type": "object", + "properties": { + "cached_tokens": { + "type": "integer", + "description": "Number of tokens retrieved from cache" + } + }, + "additionalProperties": false, + "title": "OpenAIChatCompletionUsagePromptTokensDetails", + "description": "Token details for prompt tokens in OpenAI chat completion usage." + }, + "completion_tokens_details": { + "type": "object", + "properties": { + "reasoning_tokens": { + "type": "integer", + "description": "Number of tokens used for reasoning (o1/o3 models)" + } + }, + "additionalProperties": false, + "title": "OpenAIChatCompletionUsageCompletionTokensDetails", + "description": "Token details for output tokens in OpenAI chat completion usage." 
+ } + }, + "additionalProperties": false, + "required": [ + "prompt_tokens", + "completion_tokens", + "total_tokens" + ], + "title": "OpenAIChatCompletionUsage", + "description": "Usage information for OpenAI chat completion." + }, "OpenAIChoice": { "type": "object", "properties": { @@ -7745,6 +7798,10 @@ "model": { "type": "string", "description": "The model that was used to generate the chat completion" + }, + "usage": { + "$ref": "#/components/schemas/OpenAIChatCompletionUsage", + "description": "Token usage information for the completion" } }, "additionalProperties": false, @@ -7785,6 +7842,10 @@ "model": { "type": "string", "description": "The model that was used to generate the chat completion" + }, + "usage": { + "$ref": "#/components/schemas/OpenAIChatCompletionUsage", + "description": "Token usage information (typically included in final chunk with stream_options)" } }, "additionalProperties": false, @@ -7882,6 +7943,10 @@ "type": "string", "description": "The model that was used to generate the chat completion" }, + "usage": { + "$ref": "#/components/schemas/OpenAIChatCompletionUsage", + "description": "Token usage information for the completion" + }, "input_messages": { "type": "array", "items": { @@ -9096,6 +9161,10 @@ "type": "string", "description": "(Optional) Truncation strategy applied to the response" }, + "usage": { + "$ref": "#/components/schemas/OpenAIResponseUsage", + "description": "(Optional) Token usage information for the response" + }, "input": { "type": "array", "items": { @@ -9541,6 +9610,53 @@ "title": "OpenAIResponseText", "description": "Text response configuration for OpenAI responses." }, + "OpenAIResponseUsage": { + "type": "object", + "properties": { + "input_tokens": { + "type": "integer", + "description": "Number of tokens in the input" + }, + "output_tokens": { + "type": "integer", + "description": "Number of tokens in the output" + }, + "total_tokens": { + "type": "integer", + "description": "Total tokens used (input + output)" + }, + "input_tokens_details": { + "type": "object", + "properties": { + "cached_tokens": { + "type": "integer", + "description": "Number of tokens retrieved from cache" + } + }, + "additionalProperties": false, + "description": "Detailed breakdown of input token usage" + }, + "output_tokens_details": { + "type": "object", + "properties": { + "reasoning_tokens": { + "type": "integer", + "description": "Number of tokens used for reasoning (o1/o3 models)" + } + }, + "additionalProperties": false, + "description": "Detailed breakdown of output token usage" + } + }, + "additionalProperties": false, + "required": [ + "input_tokens", + "output_tokens", + "total_tokens" + ], + "title": "OpenAIResponseUsage", + "description": "Usage information for OpenAI response." 
+ }, "ResponseShieldSpec": { "type": "object", "properties": { @@ -9983,6 +10099,10 @@ "truncation": { "type": "string", "description": "(Optional) Truncation strategy applied to the response" + }, + "usage": { + "$ref": "#/components/schemas/OpenAIResponseUsage", + "description": "(Optional) Token usage information for the response" } }, "additionalProperties": false, diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml index 1a215b877..e4871e12a 100644 --- a/docs/static/deprecated-llama-stack-spec.yaml +++ b/docs/static/deprecated-llama-stack-spec.yaml @@ -4999,6 +4999,10 @@ components: type: string description: >- The model that was used to generate the chat completion + usage: + $ref: '#/components/schemas/OpenAIChatCompletionUsage' + description: >- + Token usage information for the completion input_messages: type: array items: @@ -5165,6 +5169,49 @@ components: title: OpenAIChatCompletionToolCallFunction description: >- Function call details for OpenAI-compatible tool calls. + OpenAIChatCompletionUsage: + type: object + properties: + prompt_tokens: + type: integer + description: Number of tokens in the prompt + completion_tokens: + type: integer + description: Number of tokens in the completion + total_tokens: + type: integer + description: Total tokens used (prompt + completion) + prompt_tokens_details: + type: object + properties: + cached_tokens: + type: integer + description: Number of tokens retrieved from cache + additionalProperties: false + title: >- + OpenAIChatCompletionUsagePromptTokensDetails + description: >- + Token details for prompt tokens in OpenAI chat completion usage. + completion_tokens_details: + type: object + properties: + reasoning_tokens: + type: integer + description: >- + Number of tokens used for reasoning (o1/o3 models) + additionalProperties: false + title: >- + OpenAIChatCompletionUsageCompletionTokensDetails + description: >- + Token details for output tokens in OpenAI chat completion usage. + additionalProperties: false + required: + - prompt_tokens + - completion_tokens + - total_tokens + title: OpenAIChatCompletionUsage + description: >- + Usage information for OpenAI chat completion. 
OpenAIChoice: type: object properties: @@ -5696,6 +5743,10 @@ components: type: string description: >- The model that was used to generate the chat completion + usage: + $ref: '#/components/schemas/OpenAIChatCompletionUsage' + description: >- + Token usage information for the completion additionalProperties: false required: - id @@ -5731,6 +5782,10 @@ components: type: string description: >- The model that was used to generate the chat completion + usage: + $ref: '#/components/schemas/OpenAIChatCompletionUsage' + description: >- + Token usage information (typically included in final chunk with stream_options) additionalProperties: false required: - id @@ -5811,6 +5866,10 @@ components: type: string description: >- The model that was used to generate the chat completion + usage: + $ref: '#/components/schemas/OpenAIChatCompletionUsage' + description: >- + Token usage information for the completion input_messages: type: array items: @@ -6747,6 +6806,10 @@ components: type: string description: >- (Optional) Truncation strategy applied to the response + usage: + $ref: '#/components/schemas/OpenAIResponseUsage' + description: >- + (Optional) Token usage information for the response input: type: array items: @@ -7095,6 +7158,42 @@ components: title: OpenAIResponseText description: >- Text response configuration for OpenAI responses. + OpenAIResponseUsage: + type: object + properties: + input_tokens: + type: integer + description: Number of tokens in the input + output_tokens: + type: integer + description: Number of tokens in the output + total_tokens: + type: integer + description: Total tokens used (input + output) + input_tokens_details: + type: object + properties: + cached_tokens: + type: integer + description: Number of tokens retrieved from cache + additionalProperties: false + description: Detailed breakdown of input token usage + output_tokens_details: + type: object + properties: + reasoning_tokens: + type: integer + description: >- + Number of tokens used for reasoning (o1/o3 models) + additionalProperties: false + description: Detailed breakdown of output token usage + additionalProperties: false + required: + - input_tokens + - output_tokens + - total_tokens + title: OpenAIResponseUsage + description: Usage information for OpenAI response. ResponseShieldSpec: type: object properties: @@ -7421,6 +7520,10 @@ components: type: string description: >- (Optional) Truncation strategy applied to the response + usage: + $ref: '#/components/schemas/OpenAIResponseUsage' + description: >- + (Optional) Token usage information for the response additionalProperties: false required: - created_at diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html index 9cd526176..8c363e61b 100644 --- a/docs/static/llama-stack-spec.html +++ b/docs/static/llama-stack-spec.html @@ -4277,6 +4277,10 @@ "type": "string", "description": "The model that was used to generate the chat completion" }, + "usage": { + "$ref": "#/components/schemas/OpenAIChatCompletionUsage", + "description": "Token usage information for the completion" + }, "input_messages": { "type": "array", "items": { @@ -4479,6 +4483,55 @@ "title": "OpenAIChatCompletionToolCallFunction", "description": "Function call details for OpenAI-compatible tool calls." 
}, + "OpenAIChatCompletionUsage": { + "type": "object", + "properties": { + "prompt_tokens": { + "type": "integer", + "description": "Number of tokens in the prompt" + }, + "completion_tokens": { + "type": "integer", + "description": "Number of tokens in the completion" + }, + "total_tokens": { + "type": "integer", + "description": "Total tokens used (prompt + completion)" + }, + "prompt_tokens_details": { + "type": "object", + "properties": { + "cached_tokens": { + "type": "integer", + "description": "Number of tokens retrieved from cache" + } + }, + "additionalProperties": false, + "title": "OpenAIChatCompletionUsagePromptTokensDetails", + "description": "Token details for prompt tokens in OpenAI chat completion usage." + }, + "completion_tokens_details": { + "type": "object", + "properties": { + "reasoning_tokens": { + "type": "integer", + "description": "Number of tokens used for reasoning (o1/o3 models)" + } + }, + "additionalProperties": false, + "title": "OpenAIChatCompletionUsageCompletionTokensDetails", + "description": "Token details for output tokens in OpenAI chat completion usage." + } + }, + "additionalProperties": false, + "required": [ + "prompt_tokens", + "completion_tokens", + "total_tokens" + ], + "title": "OpenAIChatCompletionUsage", + "description": "Usage information for OpenAI chat completion." + }, "OpenAIChoice": { "type": "object", "properties": { @@ -5241,6 +5294,10 @@ "model": { "type": "string", "description": "The model that was used to generate the chat completion" + }, + "usage": { + "$ref": "#/components/schemas/OpenAIChatCompletionUsage", + "description": "Token usage information for the completion" } }, "additionalProperties": false, @@ -5281,6 +5338,10 @@ "model": { "type": "string", "description": "The model that was used to generate the chat completion" + }, + "usage": { + "$ref": "#/components/schemas/OpenAIChatCompletionUsage", + "description": "Token usage information (typically included in final chunk with stream_options)" } }, "additionalProperties": false, @@ -5378,6 +5439,10 @@ "type": "string", "description": "The model that was used to generate the chat completion" }, + "usage": { + "$ref": "#/components/schemas/OpenAIChatCompletionUsage", + "description": "Token usage information for the completion" + }, "input_messages": { "type": "array", "items": { @@ -7503,6 +7568,10 @@ "type": "string", "description": "(Optional) Truncation strategy applied to the response" }, + "usage": { + "$ref": "#/components/schemas/OpenAIResponseUsage", + "description": "(Optional) Token usage information for the response" + }, "input": { "type": "array", "items": { @@ -7636,6 +7705,53 @@ "title": "OpenAIResponseText", "description": "Text response configuration for OpenAI responses." 
}, + "OpenAIResponseUsage": { + "type": "object", + "properties": { + "input_tokens": { + "type": "integer", + "description": "Number of tokens in the input" + }, + "output_tokens": { + "type": "integer", + "description": "Number of tokens in the output" + }, + "total_tokens": { + "type": "integer", + "description": "Total tokens used (input + output)" + }, + "input_tokens_details": { + "type": "object", + "properties": { + "cached_tokens": { + "type": "integer", + "description": "Number of tokens retrieved from cache" + } + }, + "additionalProperties": false, + "description": "Detailed breakdown of input token usage" + }, + "output_tokens_details": { + "type": "object", + "properties": { + "reasoning_tokens": { + "type": "integer", + "description": "Number of tokens used for reasoning (o1/o3 models)" + } + }, + "additionalProperties": false, + "description": "Detailed breakdown of output token usage" + } + }, + "additionalProperties": false, + "required": [ + "input_tokens", + "output_tokens", + "total_tokens" + ], + "title": "OpenAIResponseUsage", + "description": "Usage information for OpenAI response." + }, "ResponseShieldSpec": { "type": "object", "properties": { @@ -8078,6 +8194,10 @@ "truncation": { "type": "string", "description": "(Optional) Truncation strategy applied to the response" + }, + "usage": { + "$ref": "#/components/schemas/OpenAIResponseUsage", + "description": "(Optional) Token usage information for the response" } }, "additionalProperties": false, diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml index 66ce8e38a..bc587f939 100644 --- a/docs/static/llama-stack-spec.yaml +++ b/docs/static/llama-stack-spec.yaml @@ -3248,6 +3248,10 @@ components: type: string description: >- The model that was used to generate the chat completion + usage: + $ref: '#/components/schemas/OpenAIChatCompletionUsage' + description: >- + Token usage information for the completion input_messages: type: array items: @@ -3414,6 +3418,49 @@ components: title: OpenAIChatCompletionToolCallFunction description: >- Function call details for OpenAI-compatible tool calls. + OpenAIChatCompletionUsage: + type: object + properties: + prompt_tokens: + type: integer + description: Number of tokens in the prompt + completion_tokens: + type: integer + description: Number of tokens in the completion + total_tokens: + type: integer + description: Total tokens used (prompt + completion) + prompt_tokens_details: + type: object + properties: + cached_tokens: + type: integer + description: Number of tokens retrieved from cache + additionalProperties: false + title: >- + OpenAIChatCompletionUsagePromptTokensDetails + description: >- + Token details for prompt tokens in OpenAI chat completion usage. + completion_tokens_details: + type: object + properties: + reasoning_tokens: + type: integer + description: >- + Number of tokens used for reasoning (o1/o3 models) + additionalProperties: false + title: >- + OpenAIChatCompletionUsageCompletionTokensDetails + description: >- + Token details for output tokens in OpenAI chat completion usage. + additionalProperties: false + required: + - prompt_tokens + - completion_tokens + - total_tokens + title: OpenAIChatCompletionUsage + description: >- + Usage information for OpenAI chat completion. 
OpenAIChoice: type: object properties: @@ -3945,6 +3992,10 @@ components: type: string description: >- The model that was used to generate the chat completion + usage: + $ref: '#/components/schemas/OpenAIChatCompletionUsage' + description: >- + Token usage information for the completion additionalProperties: false required: - id @@ -3980,6 +4031,10 @@ components: type: string description: >- The model that was used to generate the chat completion + usage: + $ref: '#/components/schemas/OpenAIChatCompletionUsage' + description: >- + Token usage information (typically included in final chunk with stream_options) additionalProperties: false required: - id @@ -4060,6 +4115,10 @@ components: type: string description: >- The model that was used to generate the chat completion + usage: + $ref: '#/components/schemas/OpenAIChatCompletionUsage' + description: >- + Token usage information for the completion input_messages: type: array items: @@ -5700,6 +5759,10 @@ components: type: string description: >- (Optional) Truncation strategy applied to the response + usage: + $ref: '#/components/schemas/OpenAIResponseUsage' + description: >- + (Optional) Token usage information for the response input: type: array items: @@ -5791,6 +5854,42 @@ components: title: OpenAIResponseText description: >- Text response configuration for OpenAI responses. + OpenAIResponseUsage: + type: object + properties: + input_tokens: + type: integer + description: Number of tokens in the input + output_tokens: + type: integer + description: Number of tokens in the output + total_tokens: + type: integer + description: Total tokens used (input + output) + input_tokens_details: + type: object + properties: + cached_tokens: + type: integer + description: Number of tokens retrieved from cache + additionalProperties: false + description: Detailed breakdown of input token usage + output_tokens_details: + type: object + properties: + reasoning_tokens: + type: integer + description: >- + Number of tokens used for reasoning (o1/o3 models) + additionalProperties: false + description: Detailed breakdown of output token usage + additionalProperties: false + required: + - input_tokens + - output_tokens + - total_tokens + title: OpenAIResponseUsage + description: Usage information for OpenAI response. ResponseShieldSpec: type: object properties: @@ -6117,6 +6216,10 @@ components: type: string description: >- (Optional) Truncation strategy applied to the response + usage: + $ref: '#/components/schemas/OpenAIResponseUsage' + description: >- + (Optional) Token usage information for the response additionalProperties: false required: - created_at diff --git a/docs/static/stainless-llama-stack-spec.html b/docs/static/stainless-llama-stack-spec.html index 3478d3338..405f64038 100644 --- a/docs/static/stainless-llama-stack-spec.html +++ b/docs/static/stainless-llama-stack-spec.html @@ -6286,6 +6286,10 @@ "type": "string", "description": "The model that was used to generate the chat completion" }, + "usage": { + "$ref": "#/components/schemas/OpenAIChatCompletionUsage", + "description": "Token usage information for the completion" + }, "input_messages": { "type": "array", "items": { @@ -6488,6 +6492,55 @@ "title": "OpenAIChatCompletionToolCallFunction", "description": "Function call details for OpenAI-compatible tool calls." 
}, + "OpenAIChatCompletionUsage": { + "type": "object", + "properties": { + "prompt_tokens": { + "type": "integer", + "description": "Number of tokens in the prompt" + }, + "completion_tokens": { + "type": "integer", + "description": "Number of tokens in the completion" + }, + "total_tokens": { + "type": "integer", + "description": "Total tokens used (prompt + completion)" + }, + "prompt_tokens_details": { + "type": "object", + "properties": { + "cached_tokens": { + "type": "integer", + "description": "Number of tokens retrieved from cache" + } + }, + "additionalProperties": false, + "title": "OpenAIChatCompletionUsagePromptTokensDetails", + "description": "Token details for prompt tokens in OpenAI chat completion usage." + }, + "completion_tokens_details": { + "type": "object", + "properties": { + "reasoning_tokens": { + "type": "integer", + "description": "Number of tokens used for reasoning (o1/o3 models)" + } + }, + "additionalProperties": false, + "title": "OpenAIChatCompletionUsageCompletionTokensDetails", + "description": "Token details for output tokens in OpenAI chat completion usage." + } + }, + "additionalProperties": false, + "required": [ + "prompt_tokens", + "completion_tokens", + "total_tokens" + ], + "title": "OpenAIChatCompletionUsage", + "description": "Usage information for OpenAI chat completion." + }, "OpenAIChoice": { "type": "object", "properties": { @@ -7250,6 +7303,10 @@ "model": { "type": "string", "description": "The model that was used to generate the chat completion" + }, + "usage": { + "$ref": "#/components/schemas/OpenAIChatCompletionUsage", + "description": "Token usage information for the completion" } }, "additionalProperties": false, @@ -7290,6 +7347,10 @@ "model": { "type": "string", "description": "The model that was used to generate the chat completion" + }, + "usage": { + "$ref": "#/components/schemas/OpenAIChatCompletionUsage", + "description": "Token usage information (typically included in final chunk with stream_options)" } }, "additionalProperties": false, @@ -7387,6 +7448,10 @@ "type": "string", "description": "The model that was used to generate the chat completion" }, + "usage": { + "$ref": "#/components/schemas/OpenAIChatCompletionUsage", + "description": "Token usage information for the completion" + }, "input_messages": { "type": "array", "items": { @@ -9512,6 +9577,10 @@ "type": "string", "description": "(Optional) Truncation strategy applied to the response" }, + "usage": { + "$ref": "#/components/schemas/OpenAIResponseUsage", + "description": "(Optional) Token usage information for the response" + }, "input": { "type": "array", "items": { @@ -9645,6 +9714,53 @@ "title": "OpenAIResponseText", "description": "Text response configuration for OpenAI responses." 
}, + "OpenAIResponseUsage": { + "type": "object", + "properties": { + "input_tokens": { + "type": "integer", + "description": "Number of tokens in the input" + }, + "output_tokens": { + "type": "integer", + "description": "Number of tokens in the output" + }, + "total_tokens": { + "type": "integer", + "description": "Total tokens used (input + output)" + }, + "input_tokens_details": { + "type": "object", + "properties": { + "cached_tokens": { + "type": "integer", + "description": "Number of tokens retrieved from cache" + } + }, + "additionalProperties": false, + "description": "Detailed breakdown of input token usage" + }, + "output_tokens_details": { + "type": "object", + "properties": { + "reasoning_tokens": { + "type": "integer", + "description": "Number of tokens used for reasoning (o1/o3 models)" + } + }, + "additionalProperties": false, + "description": "Detailed breakdown of output token usage" + } + }, + "additionalProperties": false, + "required": [ + "input_tokens", + "output_tokens", + "total_tokens" + ], + "title": "OpenAIResponseUsage", + "description": "Usage information for OpenAI response." + }, "ResponseShieldSpec": { "type": "object", "properties": { @@ -10087,6 +10203,10 @@ "truncation": { "type": "string", "description": "(Optional) Truncation strategy applied to the response" + }, + "usage": { + "$ref": "#/components/schemas/OpenAIResponseUsage", + "description": "(Optional) Token usage information for the response" } }, "additionalProperties": false, diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml index 6c04542bf..182e7363d 100644 --- a/docs/static/stainless-llama-stack-spec.yaml +++ b/docs/static/stainless-llama-stack-spec.yaml @@ -4693,6 +4693,10 @@ components: type: string description: >- The model that was used to generate the chat completion + usage: + $ref: '#/components/schemas/OpenAIChatCompletionUsage' + description: >- + Token usage information for the completion input_messages: type: array items: @@ -4859,6 +4863,49 @@ components: title: OpenAIChatCompletionToolCallFunction description: >- Function call details for OpenAI-compatible tool calls. + OpenAIChatCompletionUsage: + type: object + properties: + prompt_tokens: + type: integer + description: Number of tokens in the prompt + completion_tokens: + type: integer + description: Number of tokens in the completion + total_tokens: + type: integer + description: Total tokens used (prompt + completion) + prompt_tokens_details: + type: object + properties: + cached_tokens: + type: integer + description: Number of tokens retrieved from cache + additionalProperties: false + title: >- + OpenAIChatCompletionUsagePromptTokensDetails + description: >- + Token details for prompt tokens in OpenAI chat completion usage. + completion_tokens_details: + type: object + properties: + reasoning_tokens: + type: integer + description: >- + Number of tokens used for reasoning (o1/o3 models) + additionalProperties: false + title: >- + OpenAIChatCompletionUsageCompletionTokensDetails + description: >- + Token details for output tokens in OpenAI chat completion usage. + additionalProperties: false + required: + - prompt_tokens + - completion_tokens + - total_tokens + title: OpenAIChatCompletionUsage + description: >- + Usage information for OpenAI chat completion. 
OpenAIChoice: type: object properties: @@ -5390,6 +5437,10 @@ components: type: string description: >- The model that was used to generate the chat completion + usage: + $ref: '#/components/schemas/OpenAIChatCompletionUsage' + description: >- + Token usage information for the completion additionalProperties: false required: - id @@ -5425,6 +5476,10 @@ components: type: string description: >- The model that was used to generate the chat completion + usage: + $ref: '#/components/schemas/OpenAIChatCompletionUsage' + description: >- + Token usage information (typically included in final chunk with stream_options) additionalProperties: false required: - id @@ -5505,6 +5560,10 @@ components: type: string description: >- The model that was used to generate the chat completion + usage: + $ref: '#/components/schemas/OpenAIChatCompletionUsage' + description: >- + Token usage information for the completion input_messages: type: array items: @@ -7145,6 +7204,10 @@ components: type: string description: >- (Optional) Truncation strategy applied to the response + usage: + $ref: '#/components/schemas/OpenAIResponseUsage' + description: >- + (Optional) Token usage information for the response input: type: array items: @@ -7236,6 +7299,42 @@ components: title: OpenAIResponseText description: >- Text response configuration for OpenAI responses. + OpenAIResponseUsage: + type: object + properties: + input_tokens: + type: integer + description: Number of tokens in the input + output_tokens: + type: integer + description: Number of tokens in the output + total_tokens: + type: integer + description: Total tokens used (input + output) + input_tokens_details: + type: object + properties: + cached_tokens: + type: integer + description: Number of tokens retrieved from cache + additionalProperties: false + description: Detailed breakdown of input token usage + output_tokens_details: + type: object + properties: + reasoning_tokens: + type: integer + description: >- + Number of tokens used for reasoning (o1/o3 models) + additionalProperties: false + description: Detailed breakdown of output token usage + additionalProperties: false + required: + - input_tokens + - output_tokens + - total_tokens + title: OpenAIResponseUsage + description: Usage information for OpenAI response. ResponseShieldSpec: type: object properties: @@ -7562,6 +7661,10 @@ components: type: string description: >- (Optional) Truncation strategy applied to the response + usage: + $ref: '#/components/schemas/OpenAIResponseUsage' + description: >- + (Optional) Token usage information for the response additionalProperties: false required: - created_at diff --git a/llama_stack/apis/agents/openai_responses.py b/llama_stack/apis/agents/openai_responses.py index 0f3511ea3..3fd08362c 100644 --- a/llama_stack/apis/agents/openai_responses.py +++ b/llama_stack/apis/agents/openai_responses.py @@ -346,6 +346,42 @@ class OpenAIResponseText(BaseModel): format: OpenAIResponseTextFormat | None = None +class OpenAIResponseUsageOutputTokensDetails(BaseModel): + """Token details for output tokens in OpenAI response usage. + + :param reasoning_tokens: Number of tokens used for reasoning (o1/o3 models) + """ + + reasoning_tokens: int | None = None + + +class OpenAIResponseUsageInputTokensDetails(BaseModel): + """Token details for input tokens in OpenAI response usage. 
+
+    :param cached_tokens: Number of tokens retrieved from cache
+    """
+
+    cached_tokens: int | None = None
+
+
+@json_schema_type
+class OpenAIResponseUsage(BaseModel):
+    """Usage information for OpenAI response.
+
+    :param input_tokens: Number of tokens in the input
+    :param output_tokens: Number of tokens in the output
+    :param total_tokens: Total tokens used (input + output)
+    :param input_tokens_details: Detailed breakdown of input token usage
+    :param output_tokens_details: Detailed breakdown of output token usage
+    """
+
+    input_tokens: int
+    output_tokens: int
+    total_tokens: int
+    input_tokens_details: OpenAIResponseUsageInputTokensDetails | None = None
+    output_tokens_details: OpenAIResponseUsageOutputTokensDetails | None = None
+
+
 @json_schema_type
 class OpenAIResponseObject(BaseModel):
     """Complete OpenAI response object containing generation results and metadata.
@@ -363,6 +399,7 @@ class OpenAIResponseObject(BaseModel):
     :param text: Text formatting configuration for the response
     :param top_p: (Optional) Nucleus sampling parameter used for generation
     :param truncation: (Optional) Truncation strategy applied to the response
+    :param usage: (Optional) Token usage information for the response
     """

     created_at: int
@@ -380,6 +417,7 @@ class OpenAIResponseObject(BaseModel):
     text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text"))
     top_p: float | None = None
     truncation: str | None = None
+    usage: OpenAIResponseUsage | None = None


 @json_schema_type
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 62a988ea6..375ddb231 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -816,6 +816,42 @@ class OpenAIChoice(BaseModel):
     logprobs: OpenAIChoiceLogprobs | None = None


+class OpenAIChatCompletionUsageCompletionTokensDetails(BaseModel):
+    """Token details for output tokens in OpenAI chat completion usage.
+
+    :param reasoning_tokens: Number of tokens used for reasoning (o1/o3 models)
+    """
+
+    reasoning_tokens: int | None = None
+
+
+class OpenAIChatCompletionUsagePromptTokensDetails(BaseModel):
+    """Token details for prompt tokens in OpenAI chat completion usage.
+
+    :param cached_tokens: Number of tokens retrieved from cache
+    """
+
+    cached_tokens: int | None = None
+
+
+@json_schema_type
+class OpenAIChatCompletionUsage(BaseModel):
+    """Usage information for OpenAI chat completion.
+
+    :param prompt_tokens: Number of tokens in the prompt
+    :param completion_tokens: Number of tokens in the completion
+    :param total_tokens: Total tokens used (prompt + completion)
+    :param prompt_tokens_details: Detailed breakdown of prompt token usage
+    :param completion_tokens_details: Detailed breakdown of completion token usage
+    """
+
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+    prompt_tokens_details: OpenAIChatCompletionUsagePromptTokensDetails | None = None
+    completion_tokens_details: OpenAIChatCompletionUsageCompletionTokensDetails | None = None
+
+
 @json_schema_type
 class OpenAIChatCompletion(BaseModel):
     """Response from an OpenAI-compatible chat completion request.
@@ -825,6 +861,7 @@ class OpenAIChatCompletion(BaseModel): :param object: The object type, which will be "chat.completion" :param created: The Unix timestamp in seconds when the chat completion was created :param model: The model that was used to generate the chat completion + :param usage: Token usage information for the completion """ id: str @@ -832,6 +869,7 @@ class OpenAIChatCompletion(BaseModel): object: Literal["chat.completion"] = "chat.completion" created: int model: str + usage: OpenAIChatCompletionUsage | None = None @json_schema_type @@ -843,6 +881,7 @@ class OpenAIChatCompletionChunk(BaseModel): :param object: The object type, which will be "chat.completion.chunk" :param created: The Unix timestamp in seconds when the chat completion was created :param model: The model that was used to generate the chat completion + :param usage: Token usage information (typically included in final chunk with stream_options) """ id: str @@ -850,6 +889,7 @@ class OpenAIChatCompletionChunk(BaseModel): object: Literal["chat.completion.chunk"] = "chat.completion.chunk" created: int model: str + usage: OpenAIChatCompletionUsage | None = None @json_schema_type
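Since this patch only adds the type definitions, a brief client-side sketch may help show how the new fields surface once providers populate them. This is a hypothetical example using the `openai` Python client against a Llama Stack OpenAI-compatible endpoint; the base URL, API key, and model name are placeholders. As the chunk docstring above notes, streaming usage is typically delivered only on the final chunk, and only when requested via `stream_options`.

```python
# Hypothetical sketch -- base URL, API key, and model name are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

# Non-streaming: usage is attached directly to the completion object.
completion = client.chat.completions.create(
    model="my-model",
    messages=[{"role": "user", "content": "Hello!"}],
)
if completion.usage is not None:
    print(
        completion.usage.prompt_tokens,
        completion.usage.completion_tokens,
        completion.usage.total_tokens,
    )

# Streaming: usage must be requested explicitly and typically arrives on the
# final chunk, where `usage` is non-None.
stream = client.chat.completions.create(
    model="my-model",
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
    stream_options={"include_usage": True},
)
for chunk in stream:
    if chunk.usage is not None:
        print(chunk.usage.total_tokens)
```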