test-fireworks-fix

This commit is contained in:
Swapna Lekkala 2025-09-11 15:58:38 -07:00
parent 69a52213a1
commit f9348a6bdf
4 changed files with 210 additions and 2 deletions

View file

@ -6304,6 +6304,9 @@
"$ref": "#/components/schemas/TokenLogProbs" "$ref": "#/components/schemas/TokenLogProbs"
}, },
"description": "Optional log probabilities for generated tokens" "description": "Optional log probabilities for generated tokens"
},
"usage": {
"$ref": "#/components/schemas/UsageInfo"
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -6362,6 +6365,31 @@
"title": "TokenLogProbs", "title": "TokenLogProbs",
"description": "Log probabilities for generated tokens." "description": "Log probabilities for generated tokens."
}, },
"UsageInfo": {
"type": "object",
"properties": {
"completion_tokens": {
"type": "integer",
"description": "Number of tokens generated"
},
"prompt_tokens": {
"type": "integer",
"description": "Number of tokens in the prompt"
},
"total_tokens": {
"type": "integer",
"description": "Total number of tokens processed"
}
},
"additionalProperties": false,
"required": [
"completion_tokens",
"prompt_tokens",
"total_tokens"
],
"title": "UsageInfo",
"description": "Usage information for a model."
},
"BatchCompletionRequest": { "BatchCompletionRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -10871,6 +10899,31 @@
"title": "OpenAIChatCompletionToolCallFunction", "title": "OpenAIChatCompletionToolCallFunction",
"description": "Function call details for OpenAI-compatible tool calls." "description": "Function call details for OpenAI-compatible tool calls."
}, },
"OpenAIChatCompletionUsage": {
"type": "object",
"properties": {
"prompt_tokens": {
"type": "integer",
"description": "The number of tokens in the prompt"
},
"completion_tokens": {
"type": "integer",
"description": "The number of tokens in the completion"
},
"total_tokens": {
"type": "integer",
"description": "The total number of tokens used"
}
},
"additionalProperties": false,
"required": [
"prompt_tokens",
"completion_tokens",
"total_tokens"
],
"title": "OpenAIChatCompletionUsage",
"description": "Usage information for an OpenAI-compatible chat completion response."
},
"OpenAIChoice": { "OpenAIChoice": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -11208,6 +11261,13 @@
"OpenAICompletionWithInputMessages": { "OpenAICompletionWithInputMessages": {
"type": "object", "type": "object",
"properties": { "properties": {
"metrics": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricInResponse"
},
"description": "(Optional) List of metrics associated with the API response"
},
"id": { "id": {
"type": "string", "type": "string",
"description": "The ID of the chat completion" "description": "The ID of the chat completion"
@ -11233,6 +11293,9 @@
"type": "string", "type": "string",
"description": "The model that was used to generate the chat completion" "description": "The model that was used to generate the chat completion"
}, },
"usage": {
"$ref": "#/components/schemas/OpenAIChatCompletionUsage"
},
"input_messages": { "input_messages": {
"type": "array", "type": "array",
"items": { "items": {
@ -12994,6 +13057,13 @@
"items": { "items": {
"type": "object", "type": "object",
"properties": { "properties": {
"metrics": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricInResponse"
},
"description": "(Optional) List of metrics associated with the API response"
},
"id": { "id": {
"type": "string", "type": "string",
"description": "The ID of the chat completion" "description": "The ID of the chat completion"
@ -13019,6 +13089,9 @@
"type": "string", "type": "string",
"description": "The model that was used to generate the chat completion" "description": "The model that was used to generate the chat completion"
}, },
"usage": {
"$ref": "#/components/schemas/OpenAIChatCompletionUsage"
},
"input_messages": { "input_messages": {
"type": "array", "type": "array",
"items": { "items": {
@ -14410,6 +14483,13 @@
"OpenAIChatCompletion": { "OpenAIChatCompletion": {
"type": "object", "type": "object",
"properties": { "properties": {
"metrics": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricInResponse"
},
"description": "(Optional) List of metrics associated with the API response"
},
"id": { "id": {
"type": "string", "type": "string",
"description": "The ID of the chat completion" "description": "The ID of the chat completion"
@ -14434,6 +14514,9 @@
"model": { "model": {
"type": "string", "type": "string",
"description": "The model that was used to generate the chat completion" "description": "The model that was used to generate the chat completion"
},
"usage": {
"$ref": "#/components/schemas/OpenAIChatCompletionUsage"
} }
}, },
"additionalProperties": false, "additionalProperties": false,

View file

@ -4499,6 +4499,8 @@ components:
$ref: '#/components/schemas/TokenLogProbs' $ref: '#/components/schemas/TokenLogProbs'
description: >- description: >-
Optional log probabilities for generated tokens Optional log probabilities for generated tokens
usage:
$ref: '#/components/schemas/UsageInfo'
additionalProperties: false additionalProperties: false
required: required:
- completion_message - completion_message
@ -4540,6 +4542,25 @@ components:
- logprobs_by_token - logprobs_by_token
title: TokenLogProbs title: TokenLogProbs
description: Log probabilities for generated tokens. description: Log probabilities for generated tokens.
UsageInfo:
type: object
properties:
completion_tokens:
type: integer
description: Number of tokens generated
prompt_tokens:
type: integer
description: Number of tokens in the prompt
total_tokens:
type: integer
description: Total number of tokens processed
additionalProperties: false
required:
- completion_tokens
- prompt_tokens
- total_tokens
title: UsageInfo
description: Usage information for a model.
BatchCompletionRequest: BatchCompletionRequest:
type: object type: object
properties: properties:
@ -8054,6 +8075,26 @@ components:
title: OpenAIChatCompletionToolCallFunction title: OpenAIChatCompletionToolCallFunction
description: >- description: >-
Function call details for OpenAI-compatible tool calls. Function call details for OpenAI-compatible tool calls.
OpenAIChatCompletionUsage:
type: object
properties:
prompt_tokens:
type: integer
description: The number of tokens in the prompt
completion_tokens:
type: integer
description: The number of tokens in the completion
total_tokens:
type: integer
description: The total number of tokens used
additionalProperties: false
required:
- prompt_tokens
- completion_tokens
- total_tokens
title: OpenAIChatCompletionUsage
description: >-
Usage information for an OpenAI-compatible chat completion response.
OpenAIChoice: OpenAIChoice:
type: object type: object
properties: properties:
@ -8316,6 +8357,12 @@ components:
OpenAICompletionWithInputMessages: OpenAICompletionWithInputMessages:
type: object type: object
properties: properties:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricInResponse'
description: >-
(Optional) List of metrics associated with the API response
id: id:
type: string type: string
description: The ID of the chat completion description: The ID of the chat completion
@ -8338,6 +8385,8 @@ components:
type: string type: string
description: >- description: >-
The model that was used to generate the chat completion The model that was used to generate the chat completion
usage:
$ref: '#/components/schemas/OpenAIChatCompletionUsage'
input_messages: input_messages:
type: array type: array
items: items:
@ -9633,6 +9682,12 @@ components:
items: items:
type: object type: object
properties: properties:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricInResponse'
description: >-
(Optional) List of metrics associated with the API response
id: id:
type: string type: string
description: The ID of the chat completion description: The ID of the chat completion
@ -9655,6 +9710,8 @@ components:
type: string type: string
description: >- description: >-
The model that was used to generate the chat completion The model that was used to generate the chat completion
usage:
$ref: '#/components/schemas/OpenAIChatCompletionUsage'
input_messages: input_messages:
type: array type: array
items: items:
@ -10670,6 +10727,12 @@ components:
OpenAIChatCompletion: OpenAIChatCompletion:
type: object type: object
properties: properties:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricInResponse'
description: >-
(Optional) List of metrics associated with the API response
id: id:
type: string type: string
description: The ID of the chat completion description: The ID of the chat completion
@ -10692,6 +10755,8 @@ components:
type: string type: string
description: >- description: >-
The model that was used to generate the chat completion The model that was used to generate the chat completion
usage:
$ref: '#/components/schemas/OpenAIChatCompletionUsage'
additionalProperties: false additionalProperties: false
required: required:
- id - id

View file

@ -451,6 +451,20 @@ class ChatCompletionResponseStreamChunk(MetricResponseMixin):
event: ChatCompletionResponseEvent event: ChatCompletionResponseEvent
@json_schema_type
class UsageInfo(BaseModel):
    """Usage information for a model.

    Token counts reported for a single completion request.

    :param completion_tokens: Number of tokens generated
    :param prompt_tokens: Number of tokens in the prompt
    :param total_tokens: Total number of tokens processed
    """

    # Tokens produced by the model for this response.
    completion_tokens: int
    # Tokens consumed by the input prompt.
    prompt_tokens: int
    # Sum of prompt and completion tokens — TODO confirm providers report it as the sum.
    total_tokens: int
@json_schema_type @json_schema_type
class ChatCompletionResponse(MetricResponseMixin): class ChatCompletionResponse(MetricResponseMixin):
"""Response from a chat completion request. """Response from a chat completion request.
@ -461,6 +475,7 @@ class ChatCompletionResponse(MetricResponseMixin):
completion_message: CompletionMessage completion_message: CompletionMessage
logprobs: list[TokenLogProbs] | None = None logprobs: list[TokenLogProbs] | None = None
usage: UsageInfo | None = None
@json_schema_type @json_schema_type
@ -818,7 +833,21 @@ class OpenAIChoice(BaseModel):
@json_schema_type @json_schema_type
class OpenAIChatCompletion(BaseModel): class OpenAIChatCompletionUsage(BaseModel):
"""Usage information for an OpenAI-compatible chat completion response.
:param prompt_tokens: The number of tokens in the prompt
:param completion_tokens: The number of tokens in the completion
:param total_tokens: The total number of tokens used
"""
prompt_tokens: int
completion_tokens: int
total_tokens: int
@json_schema_type
class OpenAIChatCompletion(MetricResponseMixin):
"""Response from an OpenAI-compatible chat completion request. """Response from an OpenAI-compatible chat completion request.
:param id: The ID of the chat completion :param id: The ID of the chat completion
@ -833,6 +862,7 @@ class OpenAIChatCompletion(BaseModel):
object: Literal["chat.completion"] = "chat.completion" object: Literal["chat.completion"] = "chat.completion"
created: int created: int
model: str model: str
usage: OpenAIChatCompletionUsage | None = None
@json_schema_type @json_schema_type

View file

@ -31,6 +31,8 @@ from openai.types.chat import (
ChatCompletionContentPartTextParam as OpenAIChatCompletionContentPartTextParam, ChatCompletionContentPartTextParam as OpenAIChatCompletionContentPartTextParam,
) )
from llama_stack.apis.inference.inference import UsageInfo
try: try:
from openai.types.chat import ( from openai.types.chat import (
ChatCompletionMessageFunctionToolCall as OpenAIChatCompletionMessageFunctionToolCall, ChatCompletionMessageFunctionToolCall as OpenAIChatCompletionMessageFunctionToolCall,
@ -103,6 +105,7 @@ from llama_stack.apis.inference import (
JsonSchemaResponseFormat, JsonSchemaResponseFormat,
Message, Message,
OpenAIChatCompletion, OpenAIChatCompletion,
OpenAIChatCompletionUsage,
OpenAICompletion, OpenAICompletion,
OpenAICompletionChoice, OpenAICompletionChoice,
OpenAIEmbeddingData, OpenAIEmbeddingData,
@ -277,6 +280,11 @@ def process_chat_completion_response(
request: ChatCompletionRequest, request: ChatCompletionRequest,
) -> ChatCompletionResponse: ) -> ChatCompletionResponse:
choice = response.choices[0] choice = response.choices[0]
usage = UsageInfo(
prompt_tokens=response.usage.prompt_tokens,
completion_tokens=response.usage.completion_tokens,
total_tokens=response.usage.total_tokens,
)
if choice.finish_reason == "tool_calls": if choice.finish_reason == "tool_calls":
if not choice.message or not choice.message.tool_calls: if not choice.message or not choice.message.tool_calls:
raise ValueError("Tool calls are not present in the response") raise ValueError("Tool calls are not present in the response")
@ -290,6 +298,7 @@ def process_chat_completion_response(
content=json.dumps(tool_calls, default=lambda x: x.model_dump()), content=json.dumps(tool_calls, default=lambda x: x.model_dump()),
), ),
logprobs=None, logprobs=None,
usage=usage,
) )
else: else:
# Otherwise, return tool calls as normal # Otherwise, return tool calls as normal
@ -301,6 +310,7 @@ def process_chat_completion_response(
content="", content="",
), ),
logprobs=None, logprobs=None,
usage=usage,
) )
# TODO: This does not work well with tool calls for vLLM remote provider # TODO: This does not work well with tool calls for vLLM remote provider
@ -335,6 +345,7 @@ def process_chat_completion_response(
tool_calls=raw_message.tool_calls, tool_calls=raw_message.tool_calls,
), ),
logprobs=None, logprobs=None,
usage=usage,
) )
@ -1375,6 +1386,7 @@ class OpenAIChatCompletionToLlamaStackMixin:
user: str | None = None, user: str | None = None,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
messages = openai_messages_to_messages(messages) messages = openai_messages_to_messages(messages)
response_format = _convert_openai_request_response_format(response_format) response_format = _convert_openai_request_response_format(response_format)
sampling_params = _convert_openai_sampling_params( sampling_params = _convert_openai_sampling_params(
max_tokens=max_tokens, max_tokens=max_tokens,
@ -1405,9 +1417,10 @@ class OpenAIChatCompletionToLlamaStackMixin:
if stream: if stream:
return OpenAIChatCompletionToLlamaStackMixin._process_stream_response(self, model, outstanding_responses) return OpenAIChatCompletionToLlamaStackMixin._process_stream_response(self, model, outstanding_responses)
return await OpenAIChatCompletionToLlamaStackMixin._process_non_stream_response( response = await OpenAIChatCompletionToLlamaStackMixin._process_non_stream_response(
self, model, outstanding_responses self, model, outstanding_responses
) )
return response
async def _process_stream_response( async def _process_stream_response(
self, self,
@ -1476,12 +1489,22 @@ class OpenAIChatCompletionToLlamaStackMixin:
self, model: str, outstanding_responses: list[Awaitable[ChatCompletionResponse]] self, model: str, outstanding_responses: list[Awaitable[ChatCompletionResponse]]
) -> OpenAIChatCompletion: ) -> OpenAIChatCompletion:
choices = [] choices = []
total_prompt_tokens = 0
total_completion_tokens = 0
total_tokens = 0
for outstanding_response in outstanding_responses: for outstanding_response in outstanding_responses:
response = await outstanding_response response = await outstanding_response
completion_message = response.completion_message completion_message = response.completion_message
message = await convert_message_to_openai_dict_new(completion_message) message = await convert_message_to_openai_dict_new(completion_message)
finish_reason = _convert_stop_reason_to_openai_finish_reason(completion_message.stop_reason) finish_reason = _convert_stop_reason_to_openai_finish_reason(completion_message.stop_reason)
# Aggregate usage data
if response.usage:
total_prompt_tokens += response.usage.prompt_tokens
total_completion_tokens += response.usage.completion_tokens
total_tokens += response.usage.total_tokens
choice = OpenAIChatCompletionChoice( choice = OpenAIChatCompletionChoice(
index=len(choices), index=len(choices),
message=message, message=message,
@ -1489,12 +1512,19 @@ class OpenAIChatCompletionToLlamaStackMixin:
) )
choices.append(choice) choices.append(choice)
usage = None
if total_tokens > 0:
usage = OpenAIChatCompletionUsage(
prompt_tokens=total_prompt_tokens, completion_tokens=total_completion_tokens, total_tokens=total_tokens
)
return OpenAIChatCompletion( return OpenAIChatCompletion(
id=f"chatcmpl-{uuid.uuid4()}", id=f"chatcmpl-{uuid.uuid4()}",
choices=choices, choices=choices,
created=int(time.time()), created=int(time.time()),
model=model, model=model,
object="chat.completion", object="chat.completion",
usage=usage,
) )