mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-11 13:44:38 +00:00
feat(responses): add usage types to inference and responses APIs (#3764)
Some checks failed
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 0s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 0s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 4s
Python Package Build Test / build (3.12) (push) Failing after 2s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Vector IO Integration Tests / test-matrix (push) Failing after 6s
Unit Tests / unit-tests (3.12) (push) Failing after 4s
Test External API and Providers / test-external (venv) (push) Failing after 6s
Unit Tests / unit-tests (3.13) (push) Failing after 4s
Python Package Build Test / build (3.13) (push) Failing after 23s
Integration Tests (Replay) / Integration Tests (, , , client=, ) (push) Failing after 27s
API Conformance Tests / check-schema-compatibility (push) Successful in 36s
UI Tests / ui-tests (22) (push) Successful in 55s
Pre-commit / pre-commit (push) Successful in 2m7s
Some checks failed
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 0s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 0s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 4s
Python Package Build Test / build (3.12) (push) Failing after 2s
Test External Providers Installed via Module / test-external-providers-from-module (venv) (push) Has been skipped
Vector IO Integration Tests / test-matrix (push) Failing after 6s
Unit Tests / unit-tests (3.12) (push) Failing after 4s
Test External API and Providers / test-external (venv) (push) Failing after 6s
Unit Tests / unit-tests (3.13) (push) Failing after 4s
Python Package Build Test / build (3.13) (push) Failing after 23s
Integration Tests (Replay) / Integration Tests (, , , client=, ) (push) Failing after 27s
API Conformance Tests / check-schema-compatibility (push) Successful in 36s
UI Tests / ui-tests (22) (push) Successful in 55s
Pre-commit / pre-commit (push) Successful in 2m7s
## Summary Adds OpenAI-compatible usage tracking types to enable reporting token consumption for both streaming and non-streaming responses. ## Type Definitions **Chat Completion Usage** (inference API): ```python class OpenAIChatCompletionUsage(BaseModel): prompt_tokens: int completion_tokens: int total_tokens: int prompt_tokens_details: OpenAIChatCompletionUsagePromptTokensDetails | None completion_tokens_details: OpenAIChatCompletionUsageCompletionTokensDetails | None ``` **Response Usage** (responses API): ```python class OpenAIResponseUsage(BaseModel): input_tokens: int output_tokens: int total_tokens: int input_tokens_details: OpenAIResponseUsageInputTokensDetails | None output_tokens_details: OpenAIResponseUsageOutputTokensDetails | None ``` This matches OpenAI's usage reporting format and enables PR #3766 to implement usage tracking in streaming responses. Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
parent
ebae0385bb
commit
aaf5036235
8 changed files with 747 additions and 0 deletions
120
docs/static/llama-stack-spec.html
vendored
120
docs/static/llama-stack-spec.html
vendored
|
@@ -4277,6 +4277,10 @@
|
|||
"type": "string",
|
||||
"description": "The model that was used to generate the chat completion"
|
||||
},
|
||||
"usage": {
|
||||
"$ref": "#/components/schemas/OpenAIChatCompletionUsage",
|
||||
"description": "Token usage information for the completion"
|
||||
},
|
||||
"input_messages": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
|
@@ -4479,6 +4483,55 @@
|
|||
"title": "OpenAIChatCompletionToolCallFunction",
|
||||
"description": "Function call details for OpenAI-compatible tool calls."
|
||||
},
|
||||
"OpenAIChatCompletionUsage": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"prompt_tokens": {
|
||||
"type": "integer",
|
||||
"description": "Number of tokens in the prompt"
|
||||
},
|
||||
"completion_tokens": {
|
||||
"type": "integer",
|
||||
"description": "Number of tokens in the completion"
|
||||
},
|
||||
"total_tokens": {
|
||||
"type": "integer",
|
||||
"description": "Total tokens used (prompt + completion)"
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"cached_tokens": {
|
||||
"type": "integer",
|
||||
"description": "Number of tokens retrieved from cache"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"title": "OpenAIChatCompletionUsagePromptTokensDetails",
|
||||
"description": "Token details for prompt tokens in OpenAI chat completion usage."
|
||||
},
|
||||
"completion_tokens_details": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"reasoning_tokens": {
|
||||
"type": "integer",
|
||||
"description": "Number of tokens used for reasoning (o1/o3 models)"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"title": "OpenAIChatCompletionUsageCompletionTokensDetails",
|
||||
"description": "Token details for output tokens in OpenAI chat completion usage."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"prompt_tokens",
|
||||
"completion_tokens",
|
||||
"total_tokens"
|
||||
],
|
||||
"title": "OpenAIChatCompletionUsage",
|
||||
"description": "Usage information for OpenAI chat completion."
|
||||
},
|
||||
"OpenAIChoice": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@@ -5241,6 +5294,10 @@
|
|||
"model": {
|
||||
"type": "string",
|
||||
"description": "The model that was used to generate the chat completion"
|
||||
},
|
||||
"usage": {
|
||||
"$ref": "#/components/schemas/OpenAIChatCompletionUsage",
|
||||
"description": "Token usage information for the completion"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
@@ -5281,6 +5338,10 @@
|
|||
"model": {
|
||||
"type": "string",
|
||||
"description": "The model that was used to generate the chat completion"
|
||||
},
|
||||
"usage": {
|
||||
"$ref": "#/components/schemas/OpenAIChatCompletionUsage",
|
||||
"description": "Token usage information (typically included in final chunk with stream_options)"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
@@ -5378,6 +5439,10 @@
|
|||
"type": "string",
|
||||
"description": "The model that was used to generate the chat completion"
|
||||
},
|
||||
"usage": {
|
||||
"$ref": "#/components/schemas/OpenAIChatCompletionUsage",
|
||||
"description": "Token usage information for the completion"
|
||||
},
|
||||
"input_messages": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
|
@@ -7503,6 +7568,10 @@
|
|||
"type": "string",
|
||||
"description": "(Optional) Truncation strategy applied to the response"
|
||||
},
|
||||
"usage": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseUsage",
|
||||
"description": "(Optional) Token usage information for the response"
|
||||
},
|
||||
"input": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
|
@@ -7636,6 +7705,53 @@
|
|||
"title": "OpenAIResponseText",
|
||||
"description": "Text response configuration for OpenAI responses."
|
||||
},
|
||||
"OpenAIResponseUsage": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"input_tokens": {
|
||||
"type": "integer",
|
||||
"description": "Number of tokens in the input"
|
||||
},
|
||||
"output_tokens": {
|
||||
"type": "integer",
|
||||
"description": "Number of tokens in the output"
|
||||
},
|
||||
"total_tokens": {
|
||||
"type": "integer",
|
||||
"description": "Total tokens used (input + output)"
|
||||
},
|
||||
"input_tokens_details": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"cached_tokens": {
|
||||
"type": "integer",
|
||||
"description": "Number of tokens retrieved from cache"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"description": "Detailed breakdown of input token usage"
|
||||
},
|
||||
"output_tokens_details": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"reasoning_tokens": {
|
||||
"type": "integer",
|
||||
"description": "Number of tokens used for reasoning (o1/o3 models)"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"description": "Detailed breakdown of output token usage"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"input_tokens",
|
||||
"output_tokens",
|
||||
"total_tokens"
|
||||
],
|
||||
"title": "OpenAIResponseUsage",
|
||||
"description": "Usage information for OpenAI response."
|
||||
},
|
||||
"ResponseShieldSpec": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@@ -8078,6 +8194,10 @@
|
|||
"truncation": {
|
||||
"type": "string",
|
||||
"description": "(Optional) Truncation strategy applied to the response"
|
||||
},
|
||||
"usage": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseUsage",
|
||||
"description": "(Optional) Token usage information for the response"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue