diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index fd782f6c9..9ddb070d7 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -6372,9 +6372,6 @@ "$ref": "#/components/schemas/TokenLogProbs" }, "description": "Optional log probabilities for generated tokens" - }, - "usage": { - "$ref": "#/components/schemas/UsageInfo" } }, "additionalProperties": false, @@ -6433,31 +6430,6 @@ "title": "TokenLogProbs", "description": "Log probabilities for generated tokens." }, - "UsageInfo": { - "type": "object", - "properties": { - "completion_tokens": { - "type": "integer", - "description": "Number of tokens generated" - }, - "prompt_tokens": { - "type": "integer", - "description": "Number of tokens in the prompt" - }, - "total_tokens": { - "type": "integer", - "description": "Total number of tokens processed" - } - }, - "additionalProperties": false, - "required": [ - "completion_tokens", - "prompt_tokens", - "total_tokens" - ], - "title": "UsageInfo", - "description": "Usage information for a model." - }, "BatchCompletionRequest": { "type": "object", "properties": { @@ -10967,31 +10939,6 @@ "title": "OpenAIChatCompletionToolCallFunction", "description": "Function call details for OpenAI-compatible tool calls." }, - "OpenAIChatCompletionUsage": { - "type": "object", - "properties": { - "prompt_tokens": { - "type": "integer", - "description": "The number of tokens in the prompt" - }, - "completion_tokens": { - "type": "integer", - "description": "The number of tokens in the completion" - }, - "total_tokens": { - "type": "integer", - "description": "The total number of tokens used" - } - }, - "additionalProperties": false, - "required": [ - "prompt_tokens", - "completion_tokens", - "total_tokens" - ], - "title": "OpenAIChatCompletionUsage", - "description": "Usage information for an OpenAI-compatible chat completion response." 
- }, "OpenAIChoice": { "type": "object", "properties": { @@ -11329,13 +11276,6 @@ "OpenAICompletionWithInputMessages": { "type": "object", "properties": { - "metrics": { - "type": "array", - "items": { - "$ref": "#/components/schemas/MetricInResponse" - }, - "description": "(Optional) List of metrics associated with the API response" - }, "id": { "type": "string", "description": "The ID of the chat completion" @@ -11361,9 +11301,6 @@ "type": "string", "description": "The model that was used to generate the chat completion" }, - "usage": { - "$ref": "#/components/schemas/OpenAIChatCompletionUsage" - }, "input_messages": { "type": "array", "items": { @@ -13125,13 +13062,6 @@ "items": { "type": "object", "properties": { - "metrics": { - "type": "array", - "items": { - "$ref": "#/components/schemas/MetricInResponse" - }, - "description": "(Optional) List of metrics associated with the API response" - }, "id": { "type": "string", "description": "The ID of the chat completion" @@ -13157,9 +13087,6 @@ "type": "string", "description": "The model that was used to generate the chat completion" }, - "usage": { - "$ref": "#/components/schemas/OpenAIChatCompletionUsage" - }, "input_messages": { "type": "array", "items": { @@ -14551,13 +14478,6 @@ "OpenAIChatCompletion": { "type": "object", "properties": { - "metrics": { - "type": "array", - "items": { - "$ref": "#/components/schemas/MetricInResponse" - }, - "description": "(Optional) List of metrics associated with the API response" - }, "id": { "type": "string", "description": "The ID of the chat completion" @@ -14582,9 +14502,6 @@ "model": { "type": "string", "description": "The model that was used to generate the chat completion" - }, - "usage": { - "$ref": "#/components/schemas/OpenAIChatCompletionUsage" } }, "additionalProperties": false, diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index d0096e268..94dc5c0f9 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -4548,8 +4548,6 @@ components: $ref: '#/components/schemas/TokenLogProbs' description: >- Optional log probabilities for generated tokens - usage: - $ref: '#/components/schemas/UsageInfo' additionalProperties: false required: - completion_message @@ -4591,25 +4589,6 @@ components: - logprobs_by_token title: TokenLogProbs description: Log probabilities for generated tokens. - UsageInfo: - type: object - properties: - completion_tokens: - type: integer - description: Number of tokens generated - prompt_tokens: - type: integer - description: Number of tokens in the prompt - total_tokens: - type: integer - description: Total number of tokens processed - additionalProperties: false - required: - - completion_tokens - - prompt_tokens - - total_tokens - title: UsageInfo - description: Usage information for a model. BatchCompletionRequest: type: object properties: @@ -8124,26 +8103,6 @@ components: title: OpenAIChatCompletionToolCallFunction description: >- Function call details for OpenAI-compatible tool calls. 
- OpenAIChatCompletionUsage: - type: object - properties: - prompt_tokens: - type: integer - description: The number of tokens in the prompt - completion_tokens: - type: integer - description: The number of tokens in the completion - total_tokens: - type: integer - description: The total number of tokens used - additionalProperties: false - required: - - prompt_tokens - - completion_tokens - - total_tokens - title: OpenAIChatCompletionUsage - description: >- - Usage information for an OpenAI-compatible chat completion response. OpenAIChoice: type: object properties: @@ -8406,12 +8365,6 @@ components: OpenAICompletionWithInputMessages: type: object properties: - metrics: - type: array - items: - $ref: '#/components/schemas/MetricInResponse' - description: >- - (Optional) List of metrics associated with the API response id: type: string description: The ID of the chat completion @@ -8434,8 +8387,6 @@ components: type: string description: >- The model that was used to generate the chat completion - usage: - $ref: '#/components/schemas/OpenAIChatCompletionUsage' input_messages: type: array items: @@ -9731,12 +9682,6 @@ components: items: type: object properties: - metrics: - type: array - items: - $ref: '#/components/schemas/MetricInResponse' - description: >- - (Optional) List of metrics associated with the API response id: type: string description: The ID of the chat completion @@ -9759,8 +9704,6 @@ components: type: string description: >- The model that was used to generate the chat completion - usage: - $ref: '#/components/schemas/OpenAIChatCompletionUsage' input_messages: type: array items: @@ -10776,12 +10719,6 @@ components: OpenAIChatCompletion: type: object properties: - metrics: - type: array - items: - $ref: '#/components/schemas/MetricInResponse' - description: >- - (Optional) List of metrics associated with the API response id: type: string description: The ID of the chat completion @@ -10804,8 +10741,6 @@ components: type: string description: >- The model that was used to generate the chat completion - usage: - $ref: '#/components/schemas/OpenAIChatCompletionUsage' additionalProperties: false required: - id diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index 1b7869a30..bd4737ca7 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -451,20 +451,6 @@ class ChatCompletionResponseStreamChunk(MetricResponseMixin): event: ChatCompletionResponseEvent -@json_schema_type -class UsageInfo(BaseModel): - """Usage information for a model. - - :param completion_tokens: Number of tokens generated - :param prompt_tokens: Number of tokens in the prompt - :param total_tokens: Total number of tokens processed - """ - - completion_tokens: int - prompt_tokens: int - total_tokens: int - - @json_schema_type class ChatCompletionResponse(MetricResponseMixin): """Response from a chat completion request. @@ -475,7 +461,6 @@ class ChatCompletionResponse(MetricResponseMixin): completion_message: CompletionMessage logprobs: list[TokenLogProbs] | None = None - usage: UsageInfo | None = None @json_schema_type @@ -833,21 +818,7 @@ class OpenAIChoice(BaseModel): @json_schema_type -class OpenAIChatCompletionUsage(BaseModel): - """Usage information for an OpenAI-compatible chat completion response. 
- - :param prompt_tokens: The number of tokens in the prompt - :param completion_tokens: The number of tokens in the completion - :param total_tokens: The total number of tokens used - """ - - prompt_tokens: int - completion_tokens: int - total_tokens: int - - -@json_schema_type -class OpenAIChatCompletion(MetricResponseMixin): +class OpenAIChatCompletion(BaseModel): """Response from an OpenAI-compatible chat completion request. :param id: The ID of the chat completion @@ -862,7 +833,6 @@ class OpenAIChatCompletion(MetricResponseMixin): object: Literal["chat.completion"] = "chat.completion" created: int model: str - usage: OpenAIChatCompletionUsage | None = None @json_schema_type diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index 00f3f5418..f4d37d558 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from collections.abc import AsyncGenerator, AsyncIterator +from collections.abc import AsyncGenerator from typing import Any from fireworks.client import Fireworks @@ -23,11 +23,7 @@ from llama_stack.apis.inference import ( Inference, LogProbConfig, Message, - OpenAIChatCompletion, - OpenAIChatCompletionChunk, OpenAICompletion, - OpenAIMessageParam, - OpenAIResponseFormatParam, ResponseFormat, ResponseFormatType, SamplingParams, @@ -43,7 +39,6 @@ from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, ) from llama_stack.providers.utils.inference.openai_compat import ( - OpenAIChatCompletionToLlamaStackMixin, convert_message_to_openai_dict, get_sampling_options, process_chat_completion_response, @@ -335,90 +330,3 @@ class FireworksInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Nee prompt_logprobs=prompt_logprobs, suffix=suffix, ) - - async def openai_chat_completion( - self, - model: str, - messages: list[OpenAIMessageParam], - frequency_penalty: float | None = None, - function_call: str | dict[str, Any] | None = None, - functions: list[dict[str, Any]] | None = None, - logit_bias: dict[str, float] | None = None, - logprobs: bool | None = None, - max_completion_tokens: int | None = None, - max_tokens: int | None = None, - n: int | None = None, - parallel_tool_calls: bool | None = None, - presence_penalty: float | None = None, - response_format: OpenAIResponseFormatParam | None = None, - seed: int | None = None, - stop: str | list[str] | None = None, - stream: bool | None = None, - stream_options: dict[str, Any] | None = None, - temperature: float | None = None, - tool_choice: str | dict[str, Any] | None = None, - tools: list[dict[str, Any]] | None = None, - top_logprobs: int | None = None, - top_p: float | None = None, - user: str | None = None, - ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: - model_obj = await self.model_store.get_model(model) - - # Divert Llama Models through Llama Stack inference APIs because - # Fireworks chat completions OpenAI-compatible API does not support - # tool calls properly. 
- llama_model = self.get_llama_model(model_obj.provider_resource_id) - - if llama_model: - return await OpenAIChatCompletionToLlamaStackMixin.openai_chat_completion( - self, - model=model, - messages=messages, - frequency_penalty=frequency_penalty, - function_call=function_call, - functions=functions, - logit_bias=logit_bias, - logprobs=logprobs, - max_completion_tokens=max_completion_tokens, - max_tokens=max_tokens, - n=n, - parallel_tool_calls=parallel_tool_calls, - presence_penalty=presence_penalty, - response_format=response_format, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - tool_choice=tool_choice, - tools=tools, - top_logprobs=top_logprobs, - top_p=top_p, - user=user, - ) - - return await super().openai_chat_completion( - model=model, - messages=messages, - frequency_penalty=frequency_penalty, - function_call=function_call, - functions=functions, - logit_bias=logit_bias, - logprobs=logprobs, - max_completion_tokens=max_completion_tokens, - max_tokens=max_tokens, - n=n, - parallel_tool_calls=parallel_tool_calls, - presence_penalty=presence_penalty, - response_format=response_format, - seed=seed, - stop=stop, - stream=stream, - stream_options=stream_options, - temperature=temperature, - tool_choice=tool_choice, - tools=tools, - top_logprobs=top_logprobs, - top_p=top_p, - user=user, - ) diff --git a/llama_stack/providers/remote/inference/fireworks/models.py b/llama_stack/providers/remote/inference/fireworks/models.py index 30807a0d4..444b8bf04 100644 --- a/llama_stack/providers/remote/inference/fireworks/models.py +++ b/llama_stack/providers/remote/inference/fireworks/models.py @@ -61,6 +61,7 @@ MODEL_ENTRIES = [ ), ProviderModelEntry( provider_model_id="nomic-ai/nomic-embed-text-v1.5", + aliases=["nomic-ai/nomic-embed-text-v1.5"], model_type=ModelType.embedding, metadata={ "embedding_dimension": 768, diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py index 3ef4fb134..55c2ac0ad 100644 --- a/llama_stack/providers/utils/inference/openai_compat.py +++ b/llama_stack/providers/utils/inference/openai_compat.py @@ -31,8 +31,6 @@ from openai.types.chat import ( ChatCompletionContentPartTextParam as OpenAIChatCompletionContentPartTextParam, ) -from llama_stack.apis.inference.inference import UsageInfo - try: from openai.types.chat import ( ChatCompletionMessageFunctionToolCall as OpenAIChatCompletionMessageFunctionToolCall, @@ -105,7 +103,6 @@ from llama_stack.apis.inference import ( JsonSchemaResponseFormat, Message, OpenAIChatCompletion, - OpenAIChatCompletionUsage, OpenAICompletion, OpenAICompletionChoice, OpenAIEmbeddingData, @@ -280,11 +277,6 @@ def process_chat_completion_response( request: ChatCompletionRequest, ) -> ChatCompletionResponse: choice = response.choices[0] - usage = UsageInfo( - prompt_tokens=response.usage.prompt_tokens, - completion_tokens=response.usage.completion_tokens, - total_tokens=response.usage.total_tokens, - ) if choice.finish_reason == "tool_calls": if not choice.message or not choice.message.tool_calls: raise ValueError("Tool calls are not present in the response") @@ -298,7 +290,6 @@ def process_chat_completion_response( content=json.dumps(tool_calls, default=lambda x: x.model_dump()), ), logprobs=None, - usage=usage, ) else: # Otherwise, return tool calls as normal @@ -310,7 +301,6 @@ def process_chat_completion_response( content="", ), logprobs=None, - usage=usage, ) # TODO: This does not work well with tool calls for vLLM 
remote provider @@ -345,7 +335,6 @@ def process_chat_completion_response( tool_calls=raw_message.tool_calls, ), logprobs=None, - usage=usage, ) @@ -657,7 +646,7 @@ async def convert_message_to_openai_dict_new( arguments=json.dumps(tool.arguments), ), type="function", - ).model_dump() + ) for tool in message.tool_calls ] params = {} @@ -668,7 +657,6 @@ async def convert_message_to_openai_dict_new( content=await _convert_message_content(message.content), **params, ) - elif isinstance(message, ToolResponseMessage): out = OpenAIChatCompletionToolMessage( role="tool", @@ -1387,7 +1375,6 @@ class OpenAIChatCompletionToLlamaStackMixin: user: str | None = None, ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: messages = openai_messages_to_messages(messages) - response_format = _convert_openai_request_response_format(response_format) sampling_params = _convert_openai_sampling_params( max_tokens=max_tokens, @@ -1414,6 +1401,7 @@ class OpenAIChatCompletionToLlamaStackMixin: tools=tools, ) outstanding_responses.append(response) + if stream: return OpenAIChatCompletionToLlamaStackMixin._process_stream_response(self, model, outstanding_responses) @@ -1488,22 +1476,12 @@ class OpenAIChatCompletionToLlamaStackMixin: self, model: str, outstanding_responses: list[Awaitable[ChatCompletionResponse]] ) -> OpenAIChatCompletion: choices = [] - total_prompt_tokens = 0 - total_completion_tokens = 0 - total_tokens = 0 - for outstanding_response in outstanding_responses: response = await outstanding_response completion_message = response.completion_message message = await convert_message_to_openai_dict_new(completion_message) finish_reason = _convert_stop_reason_to_openai_finish_reason(completion_message.stop_reason) - # Aggregate usage data - if response.usage: - total_prompt_tokens += response.usage.prompt_tokens - total_completion_tokens += response.usage.completion_tokens - total_tokens += response.usage.total_tokens - choice = OpenAIChatCompletionChoice( index=len(choices), message=message, @@ -1511,17 +1489,12 @@ class OpenAIChatCompletionToLlamaStackMixin: ) choices.append(choice) - usage = OpenAIChatCompletionUsage( - prompt_tokens=total_prompt_tokens, completion_tokens=total_completion_tokens, total_tokens=total_tokens - ) - return OpenAIChatCompletion( id=f"chatcmpl-{uuid.uuid4()}", choices=choices, created=int(time.time()), model=model, object="chat.completion", - usage=usage, ) diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index b232f8658..04c324618 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -13,6 +13,13 @@ import pytest from ..test_cases.test_case import TestCase +@pytest.fixture(autouse=True) +def rate_limit_delay(): + """Add delay between tests to avoid rate limiting from providers like Fireworks""" + yield + time.sleep(30) # 30 second delay after each test + + def _normalize_text(text: str) -> str: """ Normalize Unicode text by removing diacritical marks for comparison. 
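
The `rate_limit_delay` autouse fixture above is added verbatim to this module and again to `test_openai_embeddings.py` in the next file. Purely as an illustration of an alternative (not what this patch does), the same behavior could live once in a shared `conftest.py` for the inference tests, with the sleep length made configurable; the file path and the `INFERENCE_TEST_RATE_LIMIT_DELAY` variable below are assumptions for the sketch, not existing project settings.

```python
# tests/integration/inference/conftest.py -- hypothetical shared location, not part of this patch
import os
import time

import pytest


@pytest.fixture(autouse=True)
def rate_limit_delay():
    """Sleep after each test to avoid rate limiting from providers like Fireworks.

    The delay is read from an assumed environment variable so runs against
    providers without rate limits can set it to 0.
    """
    yield
    delay = float(os.environ.get("INFERENCE_TEST_RATE_LIMIT_DELAY", "30"))
    if delay > 0:
        time.sleep(delay)
```

A module-local fixture, as used in this patch, keeps the delay scoped to exactly the two suites that hit rate-limited providers, which is a reasonable trade-off against the duplication.
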
diff --git a/tests/integration/inference/test_openai_embeddings.py b/tests/integration/inference/test_openai_embeddings.py index ce3d2a8ea..fce5f5821 100644 --- a/tests/integration/inference/test_openai_embeddings.py +++ b/tests/integration/inference/test_openai_embeddings.py @@ -6,6 +6,7 @@ import base64 import struct +import time import pytest from openai import OpenAI @@ -13,6 +14,13 @@ from openai import OpenAI from llama_stack.core.library_client import LlamaStackAsLibraryClient +@pytest.fixture(autouse=True) +def rate_limit_delay(): + """Add delay between tests to avoid rate limiting from providers like Fireworks""" + yield + time.sleep(30) # 30 second delay after each test + + def decode_base64_to_floats(base64_string: str) -> list[float]: """Helper function to decode base64 string to list of float32 values.""" embedding_bytes = base64.b64decode(base64_string) diff --git a/tests/integration/suites.py b/tests/integration/suites.py index e8b1b6973..f7382f5d8 100644 --- a/tests/integration/suites.py +++ b/tests/integration/suites.py @@ -112,9 +112,10 @@ SETUP_DEFINITIONS: dict[str, Setup] = { name="fireworks", description="Fireworks provider with a text model", defaults={ - "text_model": "fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct", - "vision_model": "fireworks/accounts/fireworks/models/llama-v3p2-90b-vision-instruct", + "text_model": "accounts/fireworks/models/llama-v3p1-8b-instruct", + "vision_model": "accounts/fireworks/models/llama-v3p2-90b-vision-instruct", "embedding_model": "nomic-ai/nomic-embed-text-v1.5", + # "embedding_model": "accounts/fireworks/models/qwen3-embedding-8b", }, ), }
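
Across the spec files, the Pydantic models, and the OpenAI-compat shim, the net effect is that `OpenAIChatCompletion` goes back to a plain `BaseModel` carrying only `id`, `choices`, `object`, `created`, and `model`; `usage`, `metrics`, `OpenAIChatCompletionUsage`, and `UsageInfo` are all removed. Below is a self-contained sketch that mirrors the post-revert response shape; the `*Sketch` classes are illustrative stand-ins, not the actual `llama_stack.apis.inference` classes, and the choice fields are trimmed to those exercised in this diff.

```python
# A minimal mirror of the reverted schema, assuming pydantic v2; these classes
# are illustrative stand-ins, not imports from llama_stack.apis.inference.
from typing import Any, Literal

from pydantic import BaseModel


class OpenAIChoiceSketch(BaseModel):
    """One completion choice, trimmed to the fields used in this diff."""

    index: int
    finish_reason: str
    message: dict[str, Any]  # e.g. {"role": "assistant", "content": "..."}


class OpenAIChatCompletionSketch(BaseModel):
    """Post-revert response shape: no `usage` and no `metrics` properties."""

    id: str
    choices: list[OpenAIChoiceSketch]
    object: Literal["chat.completion"] = "chat.completion"
    created: int
    model: str


response = OpenAIChatCompletionSketch(
    id="chatcmpl-1234",
    choices=[
        OpenAIChoiceSketch(
            index=0,
            finish_reason="stop",
            message={"role": "assistant", "content": "Hello!"},
        )
    ],
    created=1_700_000_000,
    model="accounts/fireworks/models/llama-v3p1-8b-instruct",
)

# Token accounting is no longer part of this schema, so callers must not
# expect a `usage` block in the serialized response.
assert "usage" not in response.model_dump()
```

Clients that had started reading `response.usage` from the Llama Stack chat completion types will no longer find that field after this change.
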