Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-12-15 11:02:52 +00:00
feat(responses): add usage types to inference and responses APIs

Add OpenAI-compatible usage tracking types:
- OpenAIChatCompletionUsage with prompt/completion token counts
- OpenAIResponseUsage with input/output token counts
- Token detail types for cached_tokens and reasoning_tokens
- Add usage field to chat completion and response objects

This enables reporting token consumption for both streaming and
non-streaming responses, matching OpenAI's usage reporting format.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
  parent ebae0385bb
  commit c92a1c99f0

8 changed files with 747 additions and 0 deletions
docs/static/deprecated-llama-stack-spec.yaml (vendored): 103 additions
@@ -4999,6 +4999,10 @@ components:
           type: string
           description: >-
             The model that was used to generate the chat completion
+        usage:
+          $ref: '#/components/schemas/OpenAIChatCompletionUsage'
+          description: >-
+            Token usage information for the completion
         input_messages:
           type: array
           items:
@@ -5165,6 +5169,49 @@ components:
       title: OpenAIChatCompletionToolCallFunction
       description: >-
         Function call details for OpenAI-compatible tool calls.
+    OpenAIChatCompletionUsage:
+      type: object
+      properties:
+        prompt_tokens:
+          type: integer
+          description: Number of tokens in the prompt
+        completion_tokens:
+          type: integer
+          description: Number of tokens in the completion
+        total_tokens:
+          type: integer
+          description: Total tokens used (prompt + completion)
+        prompt_tokens_details:
+          type: object
+          properties:
+            cached_tokens:
+              type: integer
+              description: Number of tokens retrieved from cache
+          additionalProperties: false
+          title: >-
+            OpenAIChatCompletionUsagePromptTokensDetails
+          description: >-
+            Token details for prompt tokens in OpenAI chat completion usage.
+        completion_tokens_details:
+          type: object
+          properties:
+            reasoning_tokens:
+              type: integer
+              description: >-
+                Number of tokens used for reasoning (o1/o3 models)
+          additionalProperties: false
+          title: >-
+            OpenAIChatCompletionUsageCompletionTokensDetails
+          description: >-
+            Token details for output tokens in OpenAI chat completion usage.
+      additionalProperties: false
+      required:
+        - prompt_tokens
+        - completion_tokens
+        - total_tokens
+      title: OpenAIChatCompletionUsage
+      description: >-
+        Usage information for OpenAI chat completion.
     OpenAIChoice:
       type: object
       properties:
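A client-side sketch of the `OpenAIChatCompletionUsage` shape added above, assuming a plain-dict payload as returned by an OpenAI-compatible server. The dataclass names mirror the schema; the `parse_usage` helper is hypothetical, not part of llama-stack:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class PromptTokensDetails:
    # Mirrors OpenAIChatCompletionUsagePromptTokensDetails
    cached_tokens: Optional[int] = None


@dataclass
class CompletionTokensDetails:
    # Mirrors OpenAIChatCompletionUsageCompletionTokensDetails
    reasoning_tokens: Optional[int] = None


@dataclass
class ChatCompletionUsage:
    # The three token counts are required by the schema;
    # the *_details objects are optional.
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    prompt_tokens_details: Optional[PromptTokensDetails] = None
    completion_tokens_details: Optional[CompletionTokensDetails] = None


def parse_usage(payload: dict) -> ChatCompletionUsage:
    """Hypothetical helper: build ChatCompletionUsage from a JSON dict."""
    ptd = payload.get("prompt_tokens_details")
    ctd = payload.get("completion_tokens_details")
    return ChatCompletionUsage(
        prompt_tokens=payload["prompt_tokens"],
        completion_tokens=payload["completion_tokens"],
        total_tokens=payload["total_tokens"],
        prompt_tokens_details=PromptTokensDetails(**ptd) if ptd else None,
        completion_tokens_details=CompletionTokensDetails(**ctd) if ctd else None,
    )


usage = parse_usage({
    "prompt_tokens": 120,
    "completion_tokens": 30,
    "total_tokens": 150,
    "prompt_tokens_details": {"cached_tokens": 100},
})
```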
@@ -5696,6 +5743,10 @@ components:
           type: string
           description: >-
             The model that was used to generate the chat completion
+        usage:
+          $ref: '#/components/schemas/OpenAIChatCompletionUsage'
+          description: >-
+            Token usage information for the completion
       additionalProperties: false
       required:
         - id
@@ -5731,6 +5782,10 @@ components:
           type: string
           description: >-
             The model that was used to generate the chat completion
+        usage:
+          $ref: '#/components/schemas/OpenAIChatCompletionUsage'
+          description: >-
+            Token usage information (typically included in final chunk with stream_options)
       additionalProperties: false
       required:
         - id
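As the description in this hunk notes, usage for a streamed completion typically arrives only on the final chunk when requested via stream_options. A minimal sketch of consuming such a stream, with hypothetical chunk dicts standing in for real server events (as produced by OpenAI-compatible servers when `stream_options={"include_usage": true}` is set):

```python
# Hypothetical chunk sequence: content deltas first, then a final
# chunk that carries no choices and only the usage block.
chunks = [
    {"choices": [{"delta": {"content": "Hel"}}], "usage": None},
    {"choices": [{"delta": {"content": "lo"}}], "usage": None},
    {"choices": [], "usage": {"prompt_tokens": 5,
                              "completion_tokens": 2,
                              "total_tokens": 7}},
]

text_parts = []
usage = None
for chunk in chunks:
    for choice in chunk["choices"]:
        part = choice["delta"].get("content")
        if part:
            text_parts.append(part)
    if chunk["usage"] is not None:
        usage = chunk["usage"]  # only the final chunk carries usage

text = "".join(text_parts)
```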
@@ -5811,6 +5866,10 @@ components:
           type: string
           description: >-
             The model that was used to generate the chat completion
+        usage:
+          $ref: '#/components/schemas/OpenAIChatCompletionUsage'
+          description: >-
+            Token usage information for the completion
         input_messages:
           type: array
           items:
@@ -6747,6 +6806,10 @@ components:
           type: string
           description: >-
             (Optional) Truncation strategy applied to the response
+        usage:
+          $ref: '#/components/schemas/OpenAIResponseUsage'
+          description: >-
+            (Optional) Token usage information for the response
         input:
           type: array
           items:
@@ -7095,6 +7158,42 @@ components:
       title: OpenAIResponseText
       description: >-
         Text response configuration for OpenAI responses.
+    OpenAIResponseUsage:
+      type: object
+      properties:
+        input_tokens:
+          type: integer
+          description: Number of tokens in the input
+        output_tokens:
+          type: integer
+          description: Number of tokens in the output
+        total_tokens:
+          type: integer
+          description: Total tokens used (input + output)
+        input_tokens_details:
+          type: object
+          properties:
+            cached_tokens:
+              type: integer
+              description: Number of tokens retrieved from cache
+          additionalProperties: false
+          description: Detailed breakdown of input token usage
+        output_tokens_details:
+          type: object
+          properties:
+            reasoning_tokens:
+              type: integer
+              description: >-
+                Number of tokens used for reasoning (o1/o3 models)
+          additionalProperties: false
+          description: Detailed breakdown of output token usage
+      additionalProperties: false
+      required:
+        - input_tokens
+        - output_tokens
+        - total_tokens
+      title: OpenAIResponseUsage
+      description: Usage information for OpenAI response.
     ResponseShieldSpec:
       type: object
       properties:
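The `OpenAIResponseUsage` schema added above names `input_tokens`, `output_tokens`, and `total_tokens` as required integer fields. A small sketch of checking a response payload against that contract; the validator itself is illustrative, not llama-stack code:

```python
REQUIRED = ("input_tokens", "output_tokens", "total_tokens")


def validate_response_usage(usage: dict) -> dict:
    """Check the required integer fields from the OpenAIResponseUsage schema."""
    missing = [k for k in REQUIRED if k not in usage]
    if missing:
        raise ValueError(f"usage missing required fields: {missing}")
    for key in REQUIRED:
        if not isinstance(usage[key], int):
            raise TypeError(f"{key} must be an integer")
    return usage


ok = validate_response_usage({
    "input_tokens": 200,
    "output_tokens": 80,
    "total_tokens": 280,
    "output_tokens_details": {"reasoning_tokens": 40},
})
```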
@@ -7421,6 +7520,10 @@ components:
           type: string
           description: >-
             (Optional) Truncation strategy applied to the response
+        usage:
+          $ref: '#/components/schemas/OpenAIResponseUsage'
+          description: >-
+            (Optional) Token usage information for the response
       additionalProperties: false
       required:
         - created_at