diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 1c356d1f1..88099acd0 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -590,7 +590,6 @@ class InferenceRouter(Inference): async def _nonstream_openai_chat_completion(self, provider: Inference, params: dict) -> OpenAIChatCompletion: response = await provider.openai_chat_completion(**params) - for choice in response.choices: # some providers return an empty list for no tool calls in non-streaming responses # but the OpenAI API returns None. So, set tool_calls to None if it's empty diff --git a/llama_stack/providers/remote/inference/fireworks/models.py b/llama_stack/providers/remote/inference/fireworks/models.py index 444b8bf04..30807a0d4 100644 --- a/llama_stack/providers/remote/inference/fireworks/models.py +++ b/llama_stack/providers/remote/inference/fireworks/models.py @@ -61,7 +61,6 @@ MODEL_ENTRIES = [ ), ProviderModelEntry( provider_model_id="nomic-ai/nomic-embed-text-v1.5", - aliases=["nomic-ai/nomic-embed-text-v1.5"], model_type=ModelType.embedding, metadata={ "embedding_dimension": 768, diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index 04c324618..b232f8658 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -13,13 +13,6 @@ import pytest from ..test_cases.test_case import TestCase -@pytest.fixture(autouse=True) -def rate_limit_delay(): - """Add delay between tests to avoid rate limiting from providers like Fireworks""" - yield - time.sleep(30) # 30 second delay after each test - - def _normalize_text(text: str) -> str: """ Normalize Unicode text by removing diacritical marks for comparison. diff --git a/tests/integration/inference/test_openai_embeddings.py b/tests/integration/inference/test_openai_embeddings.py index fce5f5821..ce3d2a8ea 100644 --- a/tests/integration/inference/test_openai_embeddings.py +++ b/tests/integration/inference/test_openai_embeddings.py @@ -6,7 +6,6 @@ import base64 import struct -import time import pytest from openai import OpenAI @@ -14,13 +13,6 @@ from openai import OpenAI from llama_stack.core.library_client import LlamaStackAsLibraryClient -@pytest.fixture(autouse=True) -def rate_limit_delay(): - """Add delay between tests to avoid rate limiting from providers like Fireworks""" - yield - time.sleep(30) # 30 second delay after each test - - def decode_base64_to_floats(base64_string: str) -> list[float]: """Helper function to decode base64 string to list of float32 values.""" embedding_bytes = base64.b64decode(base64_string) diff --git a/tests/integration/suites.py b/tests/integration/suites.py index f7382f5d8..fb2c44308 100644 --- a/tests/integration/suites.py +++ b/tests/integration/suites.py @@ -115,7 +115,6 @@ SETUP_DEFINITIONS: dict[str, Setup] = { "text_model": "accounts/fireworks/models/llama-v3p1-8b-instruct", "vision_model": "accounts/fireworks/models/llama-v3p2-90b-vision-instruct", "embedding_model": "nomic-ai/nomic-embed-text-v1.5", - # "embedding_model": "accounts/fireworks/models/qwen3-embedding-8b", }, ), }