diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index fd782f6c9..9ddb070d7 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -6372,9 +6372,6 @@
"$ref": "#/components/schemas/TokenLogProbs"
},
"description": "Optional log probabilities for generated tokens"
- },
- "usage": {
- "$ref": "#/components/schemas/UsageInfo"
}
},
"additionalProperties": false,
@@ -6433,31 +6430,6 @@
"title": "TokenLogProbs",
"description": "Log probabilities for generated tokens."
},
- "UsageInfo": {
- "type": "object",
- "properties": {
- "completion_tokens": {
- "type": "integer",
- "description": "Number of tokens generated"
- },
- "prompt_tokens": {
- "type": "integer",
- "description": "Number of tokens in the prompt"
- },
- "total_tokens": {
- "type": "integer",
- "description": "Total number of tokens processed"
- }
- },
- "additionalProperties": false,
- "required": [
- "completion_tokens",
- "prompt_tokens",
- "total_tokens"
- ],
- "title": "UsageInfo",
- "description": "Usage information for a model."
- },
"BatchCompletionRequest": {
"type": "object",
"properties": {
@@ -10967,31 +10939,6 @@
"title": "OpenAIChatCompletionToolCallFunction",
"description": "Function call details for OpenAI-compatible tool calls."
},
- "OpenAIChatCompletionUsage": {
- "type": "object",
- "properties": {
- "prompt_tokens": {
- "type": "integer",
- "description": "The number of tokens in the prompt"
- },
- "completion_tokens": {
- "type": "integer",
- "description": "The number of tokens in the completion"
- },
- "total_tokens": {
- "type": "integer",
- "description": "The total number of tokens used"
- }
- },
- "additionalProperties": false,
- "required": [
- "prompt_tokens",
- "completion_tokens",
- "total_tokens"
- ],
- "title": "OpenAIChatCompletionUsage",
- "description": "Usage information for an OpenAI-compatible chat completion response."
- },
"OpenAIChoice": {
"type": "object",
"properties": {
@@ -11329,13 +11276,6 @@
"OpenAICompletionWithInputMessages": {
"type": "object",
"properties": {
- "metrics": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/MetricInResponse"
- },
- "description": "(Optional) List of metrics associated with the API response"
- },
"id": {
"type": "string",
"description": "The ID of the chat completion"
@@ -11361,9 +11301,6 @@
"type": "string",
"description": "The model that was used to generate the chat completion"
},
- "usage": {
- "$ref": "#/components/schemas/OpenAIChatCompletionUsage"
- },
"input_messages": {
"type": "array",
"items": {
@@ -13125,13 +13062,6 @@
"items": {
"type": "object",
"properties": {
- "metrics": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/MetricInResponse"
- },
- "description": "(Optional) List of metrics associated with the API response"
- },
"id": {
"type": "string",
"description": "The ID of the chat completion"
@@ -13157,9 +13087,6 @@
"type": "string",
"description": "The model that was used to generate the chat completion"
},
- "usage": {
- "$ref": "#/components/schemas/OpenAIChatCompletionUsage"
- },
"input_messages": {
"type": "array",
"items": {
@@ -14551,13 +14478,6 @@
"OpenAIChatCompletion": {
"type": "object",
"properties": {
- "metrics": {
- "type": "array",
- "items": {
- "$ref": "#/components/schemas/MetricInResponse"
- },
- "description": "(Optional) List of metrics associated with the API response"
- },
"id": {
"type": "string",
"description": "The ID of the chat completion"
@@ -14582,9 +14502,6 @@
"model": {
"type": "string",
"description": "The model that was used to generate the chat completion"
- },
- "usage": {
- "$ref": "#/components/schemas/OpenAIChatCompletionUsage"
}
},
"additionalProperties": false,
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index d0096e268..94dc5c0f9 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -4548,8 +4548,6 @@ components:
$ref: '#/components/schemas/TokenLogProbs'
description: >-
Optional log probabilities for generated tokens
- usage:
- $ref: '#/components/schemas/UsageInfo'
additionalProperties: false
required:
- completion_message
@@ -4591,25 +4589,6 @@ components:
- logprobs_by_token
title: TokenLogProbs
description: Log probabilities for generated tokens.
- UsageInfo:
- type: object
- properties:
- completion_tokens:
- type: integer
- description: Number of tokens generated
- prompt_tokens:
- type: integer
- description: Number of tokens in the prompt
- total_tokens:
- type: integer
- description: Total number of tokens processed
- additionalProperties: false
- required:
- - completion_tokens
- - prompt_tokens
- - total_tokens
- title: UsageInfo
- description: Usage information for a model.
BatchCompletionRequest:
type: object
properties:
@@ -8124,26 +8103,6 @@ components:
title: OpenAIChatCompletionToolCallFunction
description: >-
Function call details for OpenAI-compatible tool calls.
- OpenAIChatCompletionUsage:
- type: object
- properties:
- prompt_tokens:
- type: integer
- description: The number of tokens in the prompt
- completion_tokens:
- type: integer
- description: The number of tokens in the completion
- total_tokens:
- type: integer
- description: The total number of tokens used
- additionalProperties: false
- required:
- - prompt_tokens
- - completion_tokens
- - total_tokens
- title: OpenAIChatCompletionUsage
- description: >-
- Usage information for an OpenAI-compatible chat completion response.
OpenAIChoice:
type: object
properties:
@@ -8406,12 +8365,6 @@ components:
OpenAICompletionWithInputMessages:
type: object
properties:
- metrics:
- type: array
- items:
- $ref: '#/components/schemas/MetricInResponse'
- description: >-
- (Optional) List of metrics associated with the API response
id:
type: string
description: The ID of the chat completion
@@ -8434,8 +8387,6 @@ components:
type: string
description: >-
The model that was used to generate the chat completion
- usage:
- $ref: '#/components/schemas/OpenAIChatCompletionUsage'
input_messages:
type: array
items:
@@ -9731,12 +9682,6 @@ components:
items:
type: object
properties:
- metrics:
- type: array
- items:
- $ref: '#/components/schemas/MetricInResponse'
- description: >-
- (Optional) List of metrics associated with the API response
id:
type: string
description: The ID of the chat completion
@@ -9759,8 +9704,6 @@ components:
type: string
description: >-
The model that was used to generate the chat completion
- usage:
- $ref: '#/components/schemas/OpenAIChatCompletionUsage'
input_messages:
type: array
items:
@@ -10776,12 +10719,6 @@ components:
OpenAIChatCompletion:
type: object
properties:
- metrics:
- type: array
- items:
- $ref: '#/components/schemas/MetricInResponse'
- description: >-
- (Optional) List of metrics associated with the API response
id:
type: string
description: The ID of the chat completion
@@ -10804,8 +10741,6 @@ components:
type: string
description: >-
The model that was used to generate the chat completion
- usage:
- $ref: '#/components/schemas/OpenAIChatCompletionUsage'
additionalProperties: false
required:
- id
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 1b7869a30..bd4737ca7 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -451,20 +451,6 @@ class ChatCompletionResponseStreamChunk(MetricResponseMixin):
event: ChatCompletionResponseEvent
-@json_schema_type
-class UsageInfo(BaseModel):
- """Usage information for a model.
-
- :param completion_tokens: Number of tokens generated
- :param prompt_tokens: Number of tokens in the prompt
- :param total_tokens: Total number of tokens processed
- """
-
- completion_tokens: int
- prompt_tokens: int
- total_tokens: int
-
-
@json_schema_type
class ChatCompletionResponse(MetricResponseMixin):
"""Response from a chat completion request.
@@ -475,7 +461,6 @@ class ChatCompletionResponse(MetricResponseMixin):
completion_message: CompletionMessage
logprobs: list[TokenLogProbs] | None = None
- usage: UsageInfo | None = None
@json_schema_type
@@ -833,21 +818,7 @@ class OpenAIChoice(BaseModel):
@json_schema_type
-class OpenAIChatCompletionUsage(BaseModel):
- """Usage information for an OpenAI-compatible chat completion response.
-
- :param prompt_tokens: The number of tokens in the prompt
- :param completion_tokens: The number of tokens in the completion
- :param total_tokens: The total number of tokens used
- """
-
- prompt_tokens: int
- completion_tokens: int
- total_tokens: int
-
-
-@json_schema_type
-class OpenAIChatCompletion(MetricResponseMixin):
+class OpenAIChatCompletion(BaseModel):
"""Response from an OpenAI-compatible chat completion request.
:param id: The ID of the chat completion
@@ -862,7 +833,6 @@ class OpenAIChatCompletion(MetricResponseMixin):
object: Literal["chat.completion"] = "chat.completion"
created: int
model: str
- usage: OpenAIChatCompletionUsage | None = None
@json_schema_type
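
With this change, OpenAIChatCompletion derives from a plain pydantic BaseModel and no
longer carries usage or metrics fields. A minimal sketch of the resulting response
shape, using an invented class name and a simplified choices type rather than the real
definition:

    # Illustrative only -- not the real class from llama_stack.apis.inference.
    from typing import Any, Literal
    from pydantic import BaseModel

    class ChatCompletionSketch(BaseModel):
        id: str
        choices: list[Any]  # OpenAIChoice objects in the real model
        object: Literal["chat.completion"] = "chat.completion"
        created: int
        model: str

    resp = ChatCompletionSketch(id="chatcmpl-123", choices=[], created=0, model="demo")
    print(getattr(resp, "usage", None))  # None: token usage is no longer reported
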
diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py
index 00f3f5418..f4d37d558 100644
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from collections.abc import AsyncGenerator, AsyncIterator
+from collections.abc import AsyncGenerator
from typing import Any
from fireworks.client import Fireworks
@@ -23,11 +23,7 @@ from llama_stack.apis.inference import (
Inference,
LogProbConfig,
Message,
- OpenAIChatCompletion,
- OpenAIChatCompletionChunk,
OpenAICompletion,
- OpenAIMessageParam,
- OpenAIResponseFormatParam,
ResponseFormat,
ResponseFormatType,
SamplingParams,
@@ -43,7 +39,6 @@ from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
)
from llama_stack.providers.utils.inference.openai_compat import (
- OpenAIChatCompletionToLlamaStackMixin,
convert_message_to_openai_dict,
get_sampling_options,
process_chat_completion_response,
@@ -335,90 +330,3 @@ class FireworksInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Nee
prompt_logprobs=prompt_logprobs,
suffix=suffix,
)
-
- async def openai_chat_completion(
- self,
- model: str,
- messages: list[OpenAIMessageParam],
- frequency_penalty: float | None = None,
- function_call: str | dict[str, Any] | None = None,
- functions: list[dict[str, Any]] | None = None,
- logit_bias: dict[str, float] | None = None,
- logprobs: bool | None = None,
- max_completion_tokens: int | None = None,
- max_tokens: int | None = None,
- n: int | None = None,
- parallel_tool_calls: bool | None = None,
- presence_penalty: float | None = None,
- response_format: OpenAIResponseFormatParam | None = None,
- seed: int | None = None,
- stop: str | list[str] | None = None,
- stream: bool | None = None,
- stream_options: dict[str, Any] | None = None,
- temperature: float | None = None,
- tool_choice: str | dict[str, Any] | None = None,
- tools: list[dict[str, Any]] | None = None,
- top_logprobs: int | None = None,
- top_p: float | None = None,
- user: str | None = None,
- ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
- model_obj = await self.model_store.get_model(model)
-
- # Divert Llama Models through Llama Stack inference APIs because
- # Fireworks chat completions OpenAI-compatible API does not support
- # tool calls properly.
- llama_model = self.get_llama_model(model_obj.provider_resource_id)
-
- if llama_model:
- return await OpenAIChatCompletionToLlamaStackMixin.openai_chat_completion(
- self,
- model=model,
- messages=messages,
- frequency_penalty=frequency_penalty,
- function_call=function_call,
- functions=functions,
- logit_bias=logit_bias,
- logprobs=logprobs,
- max_completion_tokens=max_completion_tokens,
- max_tokens=max_tokens,
- n=n,
- parallel_tool_calls=parallel_tool_calls,
- presence_penalty=presence_penalty,
- response_format=response_format,
- seed=seed,
- stop=stop,
- stream=stream,
- stream_options=stream_options,
- temperature=temperature,
- tool_choice=tool_choice,
- tools=tools,
- top_logprobs=top_logprobs,
- top_p=top_p,
- user=user,
- )
-
- return await super().openai_chat_completion(
- model=model,
- messages=messages,
- frequency_penalty=frequency_penalty,
- function_call=function_call,
- functions=functions,
- logit_bias=logit_bias,
- logprobs=logprobs,
- max_completion_tokens=max_completion_tokens,
- max_tokens=max_tokens,
- n=n,
- parallel_tool_calls=parallel_tool_calls,
- presence_penalty=presence_penalty,
- response_format=response_format,
- seed=seed,
- stop=stop,
- stream=stream,
- stream_options=stream_options,
- temperature=temperature,
- tool_choice=tool_choice,
- tools=tools,
- top_logprobs=top_logprobs,
- top_p=top_p,
- user=user,
- )
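
Deleting this override means the Fireworks adapter no longer defines
openai_chat_completion itself, so calls resolve to the OpenAIMixin implementation for
every model, including the Llama models that were previously diverted through the
Llama Stack path. A small stand-in illustration of that method-resolution effect
(class names here are invented, not the real adapter or mixin):

    import asyncio

    class MixinDemo:
        async def openai_chat_completion(self, **kwargs):
            return "handled by the OpenAI-compatible mixin"

    class AdapterDemo(MixinDemo):
        pass  # no override, so every call falls through to MixinDemo

    print(asyncio.run(AdapterDemo().openai_chat_completion(model="demo")))
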
diff --git a/llama_stack/providers/remote/inference/fireworks/models.py b/llama_stack/providers/remote/inference/fireworks/models.py
index 30807a0d4..444b8bf04 100644
--- a/llama_stack/providers/remote/inference/fireworks/models.py
+++ b/llama_stack/providers/remote/inference/fireworks/models.py
@@ -61,6 +61,7 @@ MODEL_ENTRIES = [
),
ProviderModelEntry(
provider_model_id="nomic-ai/nomic-embed-text-v1.5",
+ aliases=["nomic-ai/nomic-embed-text-v1.5"],
model_type=ModelType.embedding,
metadata={
"embedding_dimension": 768,
diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py
index 3ef4fb134..55c2ac0ad 100644
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@@ -31,8 +31,6 @@ from openai.types.chat import (
ChatCompletionContentPartTextParam as OpenAIChatCompletionContentPartTextParam,
)
-from llama_stack.apis.inference.inference import UsageInfo
-
try:
from openai.types.chat import (
ChatCompletionMessageFunctionToolCall as OpenAIChatCompletionMessageFunctionToolCall,
@@ -105,7 +103,6 @@ from llama_stack.apis.inference import (
JsonSchemaResponseFormat,
Message,
OpenAIChatCompletion,
- OpenAIChatCompletionUsage,
OpenAICompletion,
OpenAICompletionChoice,
OpenAIEmbeddingData,
@@ -280,11 +277,6 @@ def process_chat_completion_response(
request: ChatCompletionRequest,
) -> ChatCompletionResponse:
choice = response.choices[0]
- usage = UsageInfo(
- prompt_tokens=response.usage.prompt_tokens,
- completion_tokens=response.usage.completion_tokens,
- total_tokens=response.usage.total_tokens,
- )
if choice.finish_reason == "tool_calls":
if not choice.message or not choice.message.tool_calls:
raise ValueError("Tool calls are not present in the response")
@@ -298,7 +290,6 @@ def process_chat_completion_response(
content=json.dumps(tool_calls, default=lambda x: x.model_dump()),
),
logprobs=None,
- usage=usage,
)
else:
# Otherwise, return tool calls as normal
@@ -310,7 +301,6 @@ def process_chat_completion_response(
content="",
),
logprobs=None,
- usage=usage,
)
# TODO: This does not work well with tool calls for vLLM remote provider
@@ -345,7 +335,6 @@ def process_chat_completion_response(
tool_calls=raw_message.tool_calls,
),
logprobs=None,
- usage=usage,
)
@@ -657,7 +646,7 @@ async def convert_message_to_openai_dict_new(
arguments=json.dumps(tool.arguments),
),
type="function",
- ).model_dump()
+ )
for tool in message.tool_calls
]
params = {}
@@ -668,7 +657,6 @@ async def convert_message_to_openai_dict_new(
content=await _convert_message_content(message.content),
**params,
)
-
elif isinstance(message, ToolResponseMessage):
out = OpenAIChatCompletionToolMessage(
role="tool",
@@ -1387,7 +1375,6 @@ class OpenAIChatCompletionToLlamaStackMixin:
user: str | None = None,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
messages = openai_messages_to_messages(messages)
-
response_format = _convert_openai_request_response_format(response_format)
sampling_params = _convert_openai_sampling_params(
max_tokens=max_tokens,
@@ -1414,6 +1401,7 @@ class OpenAIChatCompletionToLlamaStackMixin:
tools=tools,
)
outstanding_responses.append(response)
+
if stream:
return OpenAIChatCompletionToLlamaStackMixin._process_stream_response(self, model, outstanding_responses)
@@ -1488,22 +1476,12 @@ class OpenAIChatCompletionToLlamaStackMixin:
self, model: str, outstanding_responses: list[Awaitable[ChatCompletionResponse]]
) -> OpenAIChatCompletion:
choices = []
- total_prompt_tokens = 0
- total_completion_tokens = 0
- total_tokens = 0
-
for outstanding_response in outstanding_responses:
response = await outstanding_response
completion_message = response.completion_message
message = await convert_message_to_openai_dict_new(completion_message)
finish_reason = _convert_stop_reason_to_openai_finish_reason(completion_message.stop_reason)
- # Aggregate usage data
- if response.usage:
- total_prompt_tokens += response.usage.prompt_tokens
- total_completion_tokens += response.usage.completion_tokens
- total_tokens += response.usage.total_tokens
-
choice = OpenAIChatCompletionChoice(
index=len(choices),
message=message,
@@ -1511,17 +1489,12 @@ class OpenAIChatCompletionToLlamaStackMixin:
)
choices.append(choice)
- usage = OpenAIChatCompletionUsage(
- prompt_tokens=total_prompt_tokens, completion_tokens=total_completion_tokens, total_tokens=total_tokens
- )
-
return OpenAIChatCompletion(
id=f"chatcmpl-{uuid.uuid4()}",
choices=choices,
created=int(time.time()),
model=model,
object="chat.completion",
- usage=usage,
)
diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py
index b232f8658..04c324618 100644
--- a/tests/integration/inference/test_openai_completion.py
+++ b/tests/integration/inference/test_openai_completion.py
@@ -13,6 +13,13 @@ import pytest
from ..test_cases.test_case import TestCase
+@pytest.fixture(autouse=True)
+def rate_limit_delay():
+    """Add a delay between tests to avoid rate limiting from providers like Fireworks."""
+ yield
+ time.sleep(30) # 30 second delay after each test
+
+
def _normalize_text(text: str) -> str:
"""
Normalize Unicode text by removing diacritical marks for comparison.
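
The fixture added above relies on pytest autouse semantics: the code after the yield
runs as teardown once each test in the module finishes, so every test is followed by
the sleep. A self-contained sketch of the same pattern, with a one-second delay purely
for illustration (the patch uses 30 seconds):

    import time

    import pytest

    @pytest.fixture(autouse=True)
    def rate_limit_delay():
        yield              # the test body runs here
        time.sleep(1)      # teardown: pause before the next test starts

    def test_example():
        assert 1 + 1 == 2  # the delay kicks in after this test completes
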
diff --git a/tests/integration/inference/test_openai_embeddings.py b/tests/integration/inference/test_openai_embeddings.py
index ce3d2a8ea..fce5f5821 100644
--- a/tests/integration/inference/test_openai_embeddings.py
+++ b/tests/integration/inference/test_openai_embeddings.py
@@ -6,6 +6,7 @@
import base64
import struct
+import time
import pytest
from openai import OpenAI
@@ -13,6 +14,13 @@ from openai import OpenAI
from llama_stack.core.library_client import LlamaStackAsLibraryClient
+@pytest.fixture(autouse=True)
+def rate_limit_delay():
+    """Add a delay between tests to avoid rate limiting from providers like Fireworks."""
+ yield
+ time.sleep(30) # 30 second delay after each test
+
+
def decode_base64_to_floats(base64_string: str) -> list[float]:
"""Helper function to decode base64 string to list of float32 values."""
embedding_bytes = base64.b64decode(base64_string)
diff --git a/tests/integration/suites.py b/tests/integration/suites.py
index e8b1b6973..f7382f5d8 100644
--- a/tests/integration/suites.py
+++ b/tests/integration/suites.py
@@ -112,9 +112,10 @@ SETUP_DEFINITIONS: dict[str, Setup] = {
name="fireworks",
description="Fireworks provider with a text model",
defaults={
- "text_model": "fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct",
- "vision_model": "fireworks/accounts/fireworks/models/llama-v3p2-90b-vision-instruct",
+ "text_model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
+ "vision_model": "accounts/fireworks/models/llama-v3p2-90b-vision-instruct",
"embedding_model": "nomic-ai/nomic-embed-text-v1.5",
+ # "embedding_model": "accounts/fireworks/models/qwen3-embedding-8b",
},
),
}