Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-10-04 04:04:14 +00:00
Test fixes in openai_compat
parent e56a3f266c, commit d60514b57b
7 changed files with 221 additions and 7 deletions
83  docs/_static/llama-stack-spec.html (vendored)
@@ -6372,6 +6372,9 @@
              "$ref": "#/components/schemas/TokenLogProbs"
            },
            "description": "Optional log probabilities for generated tokens"
          },
          "usage": {
            "$ref": "#/components/schemas/UsageInfo"
          }
        },
        "additionalProperties": false,
@@ -6430,6 +6433,31 @@
        "title": "TokenLogProbs",
        "description": "Log probabilities for generated tokens."
      },
      "UsageInfo": {
        "type": "object",
        "properties": {
          "completion_tokens": {
            "type": "integer",
            "description": "Number of tokens generated"
          },
          "prompt_tokens": {
            "type": "integer",
            "description": "Number of tokens in the prompt"
          },
          "total_tokens": {
            "type": "integer",
            "description": "Total number of tokens processed"
          }
        },
        "additionalProperties": false,
        "required": [
          "completion_tokens",
          "prompt_tokens",
          "total_tokens"
        ],
        "title": "UsageInfo",
        "description": "Usage information for a model."
      },
      "BatchCompletionRequest": {
        "type": "object",
        "properties": {
@@ -10939,6 +10967,31 @@
        "title": "OpenAIChatCompletionToolCallFunction",
        "description": "Function call details for OpenAI-compatible tool calls."
      },
      "OpenAIChatCompletionUsage": {
        "type": "object",
        "properties": {
          "prompt_tokens": {
            "type": "integer",
            "description": "The number of tokens in the prompt"
          },
          "completion_tokens": {
            "type": "integer",
            "description": "The number of tokens in the completion"
          },
          "total_tokens": {
            "type": "integer",
            "description": "The total number of tokens used"
          }
        },
        "additionalProperties": false,
        "required": [
          "prompt_tokens",
          "completion_tokens",
          "total_tokens"
        ],
        "title": "OpenAIChatCompletionUsage",
        "description": "Usage information for an OpenAI-compatible chat completion response."
      },
      "OpenAIChoice": {
        "type": "object",
        "properties": {
@@ -11276,6 +11329,13 @@
      "OpenAICompletionWithInputMessages": {
        "type": "object",
        "properties": {
          "metrics": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/MetricInResponse"
            },
            "description": "(Optional) List of metrics associated with the API response"
          },
          "id": {
            "type": "string",
            "description": "The ID of the chat completion"
@@ -11301,6 +11361,9 @@
            "type": "string",
            "description": "The model that was used to generate the chat completion"
          },
          "usage": {
            "$ref": "#/components/schemas/OpenAIChatCompletionUsage"
          },
          "input_messages": {
            "type": "array",
            "items": {
@@ -13062,6 +13125,13 @@
        "items": {
          "type": "object",
          "properties": {
            "metrics": {
              "type": "array",
              "items": {
                "$ref": "#/components/schemas/MetricInResponse"
              },
              "description": "(Optional) List of metrics associated with the API response"
            },
            "id": {
              "type": "string",
              "description": "The ID of the chat completion"
@@ -13087,6 +13157,9 @@
              "type": "string",
              "description": "The model that was used to generate the chat completion"
            },
            "usage": {
              "$ref": "#/components/schemas/OpenAIChatCompletionUsage"
            },
            "input_messages": {
              "type": "array",
              "items": {
@@ -14478,6 +14551,13 @@
      "OpenAIChatCompletion": {
        "type": "object",
        "properties": {
          "metrics": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/MetricInResponse"
            },
            "description": "(Optional) List of metrics associated with the API response"
          },
          "id": {
            "type": "string",
            "description": "The ID of the chat completion"
@@ -14502,6 +14582,9 @@
          "model": {
            "type": "string",
            "description": "The model that was used to generate the chat completion"
          },
          "usage": {
            "$ref": "#/components/schemas/OpenAIChatCompletionUsage"
          }
        },
        "additionalProperties": false,
65  docs/_static/llama-stack-spec.yaml (vendored)
@@ -4548,6 +4548,8 @@ components:
            $ref: '#/components/schemas/TokenLogProbs'
          description: >-
            Optional log probabilities for generated tokens
        usage:
          $ref: '#/components/schemas/UsageInfo'
      additionalProperties: false
      required:
        - completion_message
@@ -4589,6 +4591,25 @@ components:
        - logprobs_by_token
      title: TokenLogProbs
      description: Log probabilities for generated tokens.
    UsageInfo:
      type: object
      properties:
        completion_tokens:
          type: integer
          description: Number of tokens generated
        prompt_tokens:
          type: integer
          description: Number of tokens in the prompt
        total_tokens:
          type: integer
          description: Total number of tokens processed
      additionalProperties: false
      required:
        - completion_tokens
        - prompt_tokens
        - total_tokens
      title: UsageInfo
      description: Usage information for a model.
    BatchCompletionRequest:
      type: object
      properties:
@@ -8103,6 +8124,26 @@ components:
      title: OpenAIChatCompletionToolCallFunction
      description: >-
        Function call details for OpenAI-compatible tool calls.
    OpenAIChatCompletionUsage:
      type: object
      properties:
        prompt_tokens:
          type: integer
          description: The number of tokens in the prompt
        completion_tokens:
          type: integer
          description: The number of tokens in the completion
        total_tokens:
          type: integer
          description: The total number of tokens used
      additionalProperties: false
      required:
        - prompt_tokens
        - completion_tokens
        - total_tokens
      title: OpenAIChatCompletionUsage
      description: >-
        Usage information for an OpenAI-compatible chat completion response.
    OpenAIChoice:
      type: object
      properties:
@@ -8365,6 +8406,12 @@ components:
    OpenAICompletionWithInputMessages:
      type: object
      properties:
        metrics:
          type: array
          items:
            $ref: '#/components/schemas/MetricInResponse'
          description: >-
            (Optional) List of metrics associated with the API response
        id:
          type: string
          description: The ID of the chat completion
@@ -8387,6 +8434,8 @@ components:
          type: string
          description: >-
            The model that was used to generate the chat completion
        usage:
          $ref: '#/components/schemas/OpenAIChatCompletionUsage'
        input_messages:
          type: array
          items:
@@ -9682,6 +9731,12 @@ components:
      items:
        type: object
        properties:
          metrics:
            type: array
            items:
              $ref: '#/components/schemas/MetricInResponse'
            description: >-
              (Optional) List of metrics associated with the API response
          id:
            type: string
            description: The ID of the chat completion
@@ -9704,6 +9759,8 @@ components:
            type: string
            description: >-
              The model that was used to generate the chat completion
          usage:
            $ref: '#/components/schemas/OpenAIChatCompletionUsage'
          input_messages:
            type: array
            items:
@@ -10719,6 +10776,12 @@ components:
    OpenAIChatCompletion:
      type: object
      properties:
        metrics:
          type: array
          items:
            $ref: '#/components/schemas/MetricInResponse'
          description: >-
            (Optional) List of metrics associated with the API response
        id:
          type: string
          description: The ID of the chat completion
@@ -10741,6 +10804,8 @@ components:
          type: string
          description: >-
            The model that was used to generate the chat completion
        usage:
          $ref: '#/components/schemas/OpenAIChatCompletionUsage'
      additionalProperties: false
      required:
        - id
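The two schemas added above, UsageInfo and OpenAIChatCompletionUsage, both require the three token-count fields and forbid extra keys. A minimal sketch, with invented counts, of payloads that would validate against them (not part of the commit):

    # Illustrative payloads only; the numbers are made up.
    usage_info = {  # UsageInfo (native chat completion responses)
        "completion_tokens": 42,
        "prompt_tokens": 120,
        "total_tokens": 162,
    }
    openai_usage = {  # OpenAIChatCompletionUsage (OpenAI-compatible responses)
        "prompt_tokens": 120,
        "completion_tokens": 42,
        "total_tokens": 162,
    }
    # Both schemas set additionalProperties: false and mark all three fields as required.
    assert set(usage_info) == {"completion_tokens", "prompt_tokens", "total_tokens"}
    assert set(openai_usage) == {"prompt_tokens", "completion_tokens", "total_tokens"}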
@@ -451,6 +451,20 @@ class ChatCompletionResponseStreamChunk(MetricResponseMixin):
    event: ChatCompletionResponseEvent


@json_schema_type
class UsageInfo(BaseModel):
    """Usage information for a model.

    :param completion_tokens: Number of tokens generated
    :param prompt_tokens: Number of tokens in the prompt
    :param total_tokens: Total number of tokens processed
    """

    completion_tokens: int
    prompt_tokens: int
    total_tokens: int


@json_schema_type
class ChatCompletionResponse(MetricResponseMixin):
    """Response from a chat completion request.
@@ -461,6 +475,7 @@ class ChatCompletionResponse(MetricResponseMixin):

    completion_message: CompletionMessage
    logprobs: list[TokenLogProbs] | None = None
    usage: UsageInfo | None = None


@json_schema_type
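As a quick illustration (not part of this commit), UsageInfo is an ordinary Pydantic model, so it can be built directly and attached to a ChatCompletionResponse through the new optional usage field; the counts below are invented, and the import path mirrors the one used in the openai_compat hunk further down:

    from llama_stack.apis.inference.inference import UsageInfo

    # Invented counts for illustration only.
    usage = UsageInfo(prompt_tokens=120, completion_tokens=42, total_tokens=162)
    print(usage.model_dump())
    # -> {'completion_tokens': 42, 'prompt_tokens': 120, 'total_tokens': 162}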
@@ -818,7 +833,21 @@ class OpenAIChoice(BaseModel):


@json_schema_type
class OpenAIChatCompletion(BaseModel):
class OpenAIChatCompletionUsage(BaseModel):
    """Usage information for an OpenAI-compatible chat completion response.

    :param prompt_tokens: The number of tokens in the prompt
    :param completion_tokens: The number of tokens in the completion
    :param total_tokens: The total number of tokens used
    """

    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


@json_schema_type
class OpenAIChatCompletion(MetricResponseMixin):
    """Response from an OpenAI-compatible chat completion request.

    :param id: The ID of the chat completion
@@ -833,6 +862,7 @@ class OpenAIChatCompletion(BaseModel):
    object: Literal["chat.completion"] = "chat.completion"
    created: int
    model: str
    usage: OpenAIChatCompletionUsage | None = None


@json_schema_type
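A hedged sketch, not from the commit, of what the new optional field looks like from the caller's side; OpenAIChatCompletionUsage is imported from llama_stack.apis.inference as in the openai_compat hunk further down, and the token counts are invented:

    from llama_stack.apis.inference import OpenAIChatCompletionUsage

    # Invented counts; real values come from the provider or from the aggregation
    # done in OpenAIChatCompletionToLlamaStackMixin below.
    usage = OpenAIChatCompletionUsage(prompt_tokens=120, completion_tokens=42, total_tokens=162)

    def describe(usage: OpenAIChatCompletionUsage | None) -> str:
        # OpenAIChatCompletion.usage is optional, so callers should handle None.
        if usage is None:
            return "no usage reported"
        return f"{usage.prompt_tokens} prompt + {usage.completion_tokens} completion = {usage.total_tokens} total tokens"

    print(describe(usage))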
@@ -590,6 +590,7 @@ class InferenceRouter(Inference):

    async def _nonstream_openai_chat_completion(self, provider: Inference, params: dict) -> OpenAIChatCompletion:
        response = await provider.openai_chat_completion(**params)

        for choice in response.choices:
            # some providers return an empty list for no tool calls in non-streaming responses
            # but the OpenAI API returns None. So, set tool_calls to None if it's empty
@@ -739,7 +740,6 @@ class InferenceRouter(Inference):
        id = None
        created = None
        choices_data: dict[int, dict[str, Any]] = {}

        try:
            async for chunk in response:
                # Skip None chunks
@@ -130,7 +130,7 @@ class FireworksInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Nee
    async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator:
        params = await self._get_params(request)

        stream = self.client.completions.create(**params)
        stream = await self.client.completions.create(**params)
        async for chunk in process_completion_stream_response(stream):
            yield chunk

@@ -208,9 +208,9 @@ class FireworksInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Nee
        params = await self._get_params(request)

        if "messages" in params:
            stream = self.client.chat.completions.create(**params)
            stream = await self.client.chat.completions.create(**params)
        else:
            stream = self.client.completions.create(**params)
            stream = await self.client.completions.create(**params)
        async for chunk in process_chat_completion_stream_response(stream, request):
            yield chunk
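The Fireworks hunks above replace the unawaited create() calls with awaited ones. A minimal, self-contained sketch of why the await matters, using a hypothetical FakeAsyncCompletions stand-in rather than the real Fireworks SDK: with an async client, create() returns a coroutine, and only the awaited result is an async iterator that `async for` can consume.

    import asyncio

    class FakeAsyncCompletions:
        # Hypothetical stand-in for an async completions client.
        async def create(self, **params):
            async def stream():
                for chunk in ("hel", "lo"):
                    yield chunk
            return stream()

    async def main():
        client = FakeAsyncCompletions()
        stream = await client.create(prompt="hi")  # without `await`, stream is a coroutine,
        async for chunk in stream:                 # and `async for` over it raises a TypeError
            print(chunk)

    asyncio.run(main())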
@@ -31,6 +31,8 @@ from openai.types.chat import (
    ChatCompletionContentPartTextParam as OpenAIChatCompletionContentPartTextParam,
)

from llama_stack.apis.inference.inference import UsageInfo

try:
    from openai.types.chat import (
        ChatCompletionMessageFunctionToolCall as OpenAIChatCompletionMessageFunctionToolCall,
@@ -103,6 +105,7 @@ from llama_stack.apis.inference import (
    JsonSchemaResponseFormat,
    Message,
    OpenAIChatCompletion,
    OpenAIChatCompletionUsage,
    OpenAICompletion,
    OpenAICompletionChoice,
    OpenAIEmbeddingData,
@@ -277,6 +280,11 @@ def process_chat_completion_response(
    request: ChatCompletionRequest,
) -> ChatCompletionResponse:
    choice = response.choices[0]
    usage = UsageInfo(
        prompt_tokens=response.usage.prompt_tokens,
        completion_tokens=response.usage.completion_tokens,
        total_tokens=response.usage.total_tokens,
    )
    if choice.finish_reason == "tool_calls":
        if not choice.message or not choice.message.tool_calls:
            raise ValueError("Tool calls are not present in the response")
@@ -290,6 +298,7 @@ def process_chat_completion_response(
                content=json.dumps(tool_calls, default=lambda x: x.model_dump()),
            ),
            logprobs=None,
            usage=usage,
        )
    else:
        # Otherwise, return tool calls as normal
@@ -301,6 +310,7 @@ def process_chat_completion_response(
                content="",
            ),
            logprobs=None,
            usage=usage,
        )

    # TODO: This does not work well with tool calls for vLLM remote provider
@@ -335,6 +345,7 @@ def process_chat_completion_response(
            tool_calls=raw_message.tool_calls,
        ),
        logprobs=None,
        usage=usage,
    )


@@ -646,7 +657,7 @@ async def convert_message_to_openai_dict_new(
                    arguments=json.dumps(tool.arguments),
                ),
                type="function",
            )
            ).model_dump()
            for tool in message.tool_calls
        ]
        params = {}
@@ -657,6 +668,7 @@ async def convert_message_to_openai_dict_new(
            content=await _convert_message_content(message.content),
            **params,
        )

    elif isinstance(message, ToolResponseMessage):
        out = OpenAIChatCompletionToolMessage(
            role="tool",
@@ -1375,6 +1387,7 @@ class OpenAIChatCompletionToLlamaStackMixin:
        user: str | None = None,
    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        messages = openai_messages_to_messages(messages)

        response_format = _convert_openai_request_response_format(response_format)
        sampling_params = _convert_openai_sampling_params(
            max_tokens=max_tokens,
@@ -1401,7 +1414,6 @@ class OpenAIChatCompletionToLlamaStackMixin:
                tools=tools,
            )
            outstanding_responses.append(response)

        if stream:
            return OpenAIChatCompletionToLlamaStackMixin._process_stream_response(self, model, outstanding_responses)

@@ -1476,12 +1488,22 @@ class OpenAIChatCompletionToLlamaStackMixin:
        self, model: str, outstanding_responses: list[Awaitable[ChatCompletionResponse]]
    ) -> OpenAIChatCompletion:
        choices = []
        total_prompt_tokens = 0
        total_completion_tokens = 0
        total_tokens = 0

        for outstanding_response in outstanding_responses:
            response = await outstanding_response
            completion_message = response.completion_message
            message = await convert_message_to_openai_dict_new(completion_message)
            finish_reason = _convert_stop_reason_to_openai_finish_reason(completion_message.stop_reason)

            # Aggregate usage data
            if response.usage:
                total_prompt_tokens += response.usage.prompt_tokens
                total_completion_tokens += response.usage.completion_tokens
                total_tokens += response.usage.total_tokens

            choice = OpenAIChatCompletionChoice(
                index=len(choices),
                message=message,
@@ -1489,12 +1511,17 @@ class OpenAIChatCompletionToLlamaStackMixin:
            )
            choices.append(choice)

        usage = OpenAIChatCompletionUsage(
            prompt_tokens=total_prompt_tokens, completion_tokens=total_completion_tokens, total_tokens=total_tokens
        )

        return OpenAIChatCompletion(
            id=f"chatcmpl-{uuid.uuid4()}",
            choices=choices,
            created=int(time.time()),
            model=model,
            object="chat.completion",
            usage=usage,
        )
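The _process_non_stream_response change above sums per-response usage into a single OpenAIChatCompletionUsage. A standalone sketch of that summation pattern, with a hypothetical aggregate_usage helper and invented counts (not code from the commit):

    from llama_stack.apis.inference import OpenAIChatCompletionUsage
    from llama_stack.apis.inference.inference import UsageInfo

    def aggregate_usage(usages: list[UsageInfo | None]) -> OpenAIChatCompletionUsage:
        # Hypothetical helper mirroring the aggregation above: responses that
        # report no usage simply do not contribute to the totals.
        prompt = completion = total = 0
        for usage in usages:
            if usage:
                prompt += usage.prompt_tokens
                completion += usage.completion_tokens
                total += usage.total_tokens
        return OpenAIChatCompletionUsage(
            prompt_tokens=prompt, completion_tokens=completion, total_tokens=total
        )

    # Invented counts for illustration.
    print(aggregate_usage([UsageInfo(prompt_tokens=10, completion_tokens=5, total_tokens=15), None]))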
@@ -108,6 +108,15 @@ SETUP_DEFINITIONS: dict[str, Setup] = {
            "embedding_model": "together/togethercomputer/m2-bert-80M-32k-retrieval",
        },
    ),
    "fireworks": Setup(
        name="fireworks",
        description="Fireworks provider with a text model",
        defaults={
            "text_model": "fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct",
            "vision_model": "fireworks/accounts/fireworks/models/llama-v3p2-90b-vision-instruct",
            "embedding_model": "nomic-ai/nomic-embed-text-v1.5",
        },
    ),
}