Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 04:04:14 +00:00)

Merge branch 'main' into hide-non-openai-inference-apis

Commit cb534281c8
714 changed files with 123149 additions and 54618 deletions
tests/integration/inference/test_openai_completion.py

@@ -5,11 +5,26 @@
 # the root directory of this source tree.
 
+import time
+import unicodedata
+
 import pytest
 
 from ..test_cases.test_case import TestCase
 
 
+def _normalize_text(text: str) -> str:
+    """
+    Normalize Unicode text by removing diacritical marks for comparison.
+
+    The test case streaming_01 expects the answer "Sol" for the question "What's the name of the Sun
+    in latin?", but the model is returning "sōl" (with a macron over the 'o'), which is the correct
+    Latin spelling. The test is failing because it's doing a simple case-insensitive string search
+    for "sol" but the actual response contains the diacritical mark.
+    """
+    return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("ascii").lower()
+
+
 def provider_from_model(client_with_models, model_id):
     models = {m.identifier: m for m in client_with_models.models.list()}
     models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
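For reference, a standalone sketch of what the new _normalize_text helper does with the "sōl" case described in its docstring (plain Python, not part of this commit):

import unicodedata

def _normalize_text(text: str) -> str:
    # NFD decomposes "ō" into "o" plus a combining macron; encoding to ASCII
    # with errors="ignore" drops the combining mark, and lower() folds case.
    return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("ascii").lower()

assert _normalize_text("Sōl") == "sol"
assert "sol" in _normalize_text("The Latin name of the Sun is Sōl")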
@@ -25,7 +40,6 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
         "inline::sentence-transformers",
         "inline::vllm",
         "remote::bedrock",
-        "remote::cerebras",
         "remote::databricks",
         # Technically Nvidia does support OpenAI completions, but none of their hosted models
         # support both completions and chat completions endpoint and all the Llama models are
@@ -33,8 +47,17 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
         "remote::nvidia",
         "remote::runpod",
         "remote::sambanova",
         "remote::tgi",
         "remote::vertexai",
+        # {"error":{"message":"Unknown request URL: GET /openai/v1/completions. Please check the URL for typos,
+        # or see the docs at https://console.groq.com/docs/","type":"invalid_request_error","code":"unknown_url"}}
+        "remote::groq",
+        "remote::gemini",  # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404
+        "remote::anthropic",  # at least claude-3-{5,7}-{haiku,sonnet}-* / claude-{sonnet,opus}-4-* are not supported
+        "remote::azure",  # {'error': {'code': 'OperationNotSupported', 'message': 'The completion operation
+        # does not work with the specified model, gpt-5-mini. Please choose different model and try
+        # again. You can learn more about which models can be used with each operation here:
+        # https://go.microsoft.com/fwlink/?linkid=2197993.'}}"}
         "remote::watsonx",  # return 404 when hitting the /openai/v1 endpoint
     ):
         pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
@@ -56,6 +79,31 @@ def skip_if_model_doesnt_support_suffix(client_with_models, model_id):
         pytest.skip(f"Provider {provider.provider_type} doesn't support suffix.")
 
 
+def skip_if_doesnt_support_n(client_with_models, model_id):
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type in (
+        "remote::sambanova",
+        "remote::ollama",
+        # https://console.groq.com/docs/openai#currently-unsupported-openai-features
+        # -> Error code: 400 - {'error': {'message': "'n' : number must be at most 1", 'type': 'invalid_request_error'}}
+        "remote::groq",
+        # Error code: 400 - [{'error': {'code': 400, 'message': 'Only one candidate can be specified in the
+        # current model', 'status': 'INVALID_ARGUMENT'}}]
+        "remote::gemini",
+        # https://docs.anthropic.com/en/api/openai-sdk#simple-fields
+        "remote::anthropic",
+        "remote::vertexai",
+        # Error code: 400 - [{'error': {'code': 400, 'message': 'Unable to submit request because candidateCount must be 1 but
+        # the entered value was 2. Update the candidateCount value and try again.', 'status': 'INVALID_ARGUMENT'}
+        "remote::tgi",  # TGI ignores n param silently
+        "remote::together",  # `n` > 1 is not supported when streaming tokens. Please disable `stream`
+        # Error code 400 - {'message': '"n" > 1 is not currently supported', 'type': 'invalid_request_error', 'param': 'n', 'code': 'wrong_api_format'}
+        "remote::cerebras",
+        "remote::databricks",  # Bad request: parameter "n" must be equal to 1 for streaming mode
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support n param.")
+
+
 def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
     provider = provider_from_model(client_with_models, model_id)
     if provider.provider_type in (
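For context, a minimal request exercising the n parameter that this helper gates, sketched against the compat_client and text_model_id fixtures used throughout this file:

response = compat_client.chat.completions.create(
    model=text_model_id,
    messages=[{"role": "user", "content": "What's the name of the Sun in latin?"}],
    n=2,
    stream=True,
)
# Providers listed above either reject n > 1 outright or silently ignore it.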
@@ -63,10 +111,10 @@ def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, mode
         "inline::sentence-transformers",
         "inline::vllm",
         "remote::bedrock",
-        "remote::cerebras",
         "remote::databricks",
+        "remote::cerebras",
         "remote::runpod",
         "remote::tgi",
         "remote::watsonx",  # watsonx returns 404 when hitting the /openai/v1 endpoint
     ):
         pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI chat completions.")
@@ -130,7 +178,8 @@ def test_openai_completion_non_streaming_suffix(llama_stack_client, client_with_
     assert len(response.choices) > 0
     choice = response.choices[0]
     assert len(choice.text) > 5
-    assert "france" in choice.text.lower()
+    normalized_text = _normalize_text(choice.text)
+    assert "france" in normalized_text
 
 
 @pytest.mark.parametrize(
@@ -221,7 +270,9 @@ def test_openai_chat_completion_non_streaming(compat_client, client_with_models,
     )
     message_content = response.choices[0].message.content.lower().strip()
     assert len(message_content) > 0
-    assert expected.lower() in message_content
+    normalized_expected = _normalize_text(expected)
+    normalized_content = _normalize_text(message_content)
+    assert normalized_expected in normalized_content
 
 
 @pytest.mark.parametrize(
@@ -245,10 +296,13 @@ def test_openai_chat_completion_streaming(compat_client, client_with_models, tex
     )
     streamed_content = []
     for chunk in response:
-        if chunk.choices[0].delta.content:
+        # On some providers like Azure, the choices are empty on the first chunk, so we need to check for that
+        if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content:
             streamed_content.append(chunk.choices[0].delta.content.lower().strip())
     assert len(streamed_content) > 0
-    assert expected.lower() in "".join(streamed_content)
+    normalized_expected = _normalize_text(expected)
+    normalized_content = _normalize_text("".join(streamed_content))
+    assert normalized_expected in normalized_content
 
 
 @pytest.mark.parametrize(
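A self-contained illustration of why the added guard matters, using toy stand-ins for the stream chunk objects (hypothetical shapes, not the real client types):

from dataclasses import dataclass, field

@dataclass
class Delta:
    content: str | None = None

@dataclass
class Choice:
    delta: Delta = field(default_factory=Delta)

@dataclass
class Chunk:
    choices: list[Choice] = field(default_factory=list)

# Azure-style stream: the first chunk carries an empty choices list, so
# indexing chunk.choices[0] unconditionally would raise IndexError.
stream = [Chunk(), Chunk(choices=[Choice(Delta("Sol"))]), Chunk(choices=[Choice(Delta(None))])]
collected = [c.choices[0].delta.content for c in stream if c.choices and c.choices[0].delta.content]
assert collected == ["Sol"]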
@@ -260,10 +314,7 @@ def test_openai_chat_completion_streaming(compat_client, client_with_models, tex
 )
 def test_openai_chat_completion_streaming_with_n(compat_client, client_with_models, text_model_id, test_case):
     skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
-
-    provider = provider_from_model(client_with_models, text_model_id)
-    if provider.provider_type == "remote::ollama":
-        pytest.skip(f"Model {text_model_id} hosted by {provider.provider_type} doesn't support n > 1.")
+    skip_if_doesnt_support_n(client_with_models, text_model_id)
 
     tc = TestCase(test_case)
     question = tc["question"]
@@ -284,8 +335,12 @@ def test_openai_chat_completion_streaming_with_n(compat_client, client_with_mode
                     streamed_content.get(choice.index, "") + choice.delta.content.lower().strip()
                 )
     assert len(streamed_content) == 2
+    normalized_expected = _normalize_text(expected)
     for i, content in streamed_content.items():
-        assert expected.lower() in content, f"Choice {i}: Expected {expected.lower()} in {content}"
+        normalized_content = _normalize_text(content)
+        assert normalized_expected in normalized_content, (
+            f"Choice {i}: Expected {normalized_expected} in {normalized_content}"
+        )
 
 
 @pytest.mark.parametrize(
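The dict-keyed-by-choice.index accumulation above, reduced to a toy example with stand-in (index, text) tuples instead of real chunks:

deltas = [(0, "so"), (1, "s"), (0, "l"), (1, "ol")]  # interleaved n=2 stream
streamed_content: dict[int, str] = {}
for index, text in deltas:
    streamed_content[index] = streamed_content.get(index, "") + text
assert streamed_content == {0: "sol", 1: "sol"}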
@@ -315,16 +370,23 @@ def test_inference_store(compat_client, client_with_models, text_model_id, strea
         content = ""
         response_id = None
         for chunk in response:
-            if response_id is None:
+            if response_id is None and chunk.id:
                 response_id = chunk.id
-            if chunk.choices[0].delta.content:
+            if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content:
                 content += chunk.choices[0].delta.content
     else:
         response_id = response.id
         content = response.choices[0].message.content
 
-    responses = client.chat.completions.list(limit=1000)
-    assert response_id in [r.id for r in responses.data]
+    tries = 0
+    while tries < 10:
+        responses = client.chat.completions.list(limit=1000)
+        if response_id in [r.id for r in responses.data]:
+            break
+        else:
+            tries += 1
+            time.sleep(0.1)
+    assert tries < 10, f"Response {response_id} not found after 1 second"
 
     retrieved_response = client.chat.completions.retrieve(response_id)
     assert retrieved_response.id == response_id
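The inline retry loop could also be factored into a helper; a hypothetical sketch (wait_for_stored_response is not a function in this file):

import time

def wait_for_stored_response(client, response_id: str, retries: int = 10, delay: float = 0.1) -> bool:
    """Poll the stored completions list until response_id appears or retries run out."""
    for _ in range(retries):
        responses = client.chat.completions.list(limit=1000)
        if response_id in [r.id for r in responses.data]:
            return True
        time.sleep(delay)
    return False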
@@ -379,15 +441,28 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode
         content = ""
         response_id = None
         for chunk in response:
-            if response_id is None:
+            if response_id is None and chunk.id:
                 response_id = chunk.id
-            if delta := chunk.choices[0].delta:
-                if delta.content:
-                    content += delta.content
+            if chunk.choices and len(chunk.choices) > 0:
+                if delta := chunk.choices[0].delta:
+                    if delta.content:
+                        content += delta.content
     else:
         response_id = response.id
         content = response.choices[0].message.content
 
+    # wait for the response to be stored
+    tries = 0
+    while tries < 10:
+        responses = client.chat.completions.list(limit=1000)
+        if response_id in [r.id for r in responses.data]:
+            break
+        else:
+            tries += 1
+            time.sleep(0.1)
+
+    assert tries < 10, f"Response {response_id} not found after 1 second"
+
     responses = client.chat.completions.list(limit=1000)
     assert response_id in [r.id for r in responses.data]
@@ -441,4 +516,5 @@ def test_openai_chat_completion_non_streaming_with_file(openai_client, client_wi
         stream=False,
     )
     message_content = response.choices[0].message.content.lower().strip()
-    assert "hello world" in message_content
+    normalized_content = _normalize_text(message_content)
+    assert "hello world" in normalized_content

tests/integration/inference/test_openai_embeddings.py
@@ -29,9 +29,40 @@ def provider_from_model(client_with_models, model_id):
     return providers[provider_id]
 
 
-def skip_if_model_doesnt_support_variable_dimensions(model_id):
-    if "text-embedding-3" not in model_id:
-        pytest.skip("{model_id} does not support variable output embedding dimensions")
+def skip_if_model_doesnt_support_user_param(client, model_id):
+    provider = provider_from_model(client, model_id)
+    if provider.provider_type in (
+        "remote::together",  # service returns 400
+        "remote::fireworks",  # service returns 400 malformed input
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} does not support user param.")
+
+
+def skip_if_model_doesnt_support_encoding_format_base64(client, model_id):
+    provider = provider_from_model(client, model_id)
+    if provider.provider_type in (
+        "remote::databricks",  # param silently ignored, always returns floats
+        "remote::fireworks",  # param silently ignored, always returns list of floats
+        "remote::ollama",  # param silently ignored, always returns list of floats
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} does not support encoding_format='base64'.")
+
+
+def skip_if_model_doesnt_support_variable_dimensions(client_with_models, model_id):
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type in (
+        "remote::together",  # returns 400
+        "inline::sentence-transformers",
+        # Error code: 400 - {'error_code': 'BAD_REQUEST', 'message': 'Bad request: json: unknown field "dimensions"\n'}
+        "remote::databricks",
+    ):
+        pytest.skip(
+            f"Model {model_id} hosted by {provider.provider_type} does not support variable output embedding dimensions."
+        )
+    if provider.provider_type == "remote::openai" and "text-embedding-3" not in model_id:
+        pytest.skip(
+            f"Model {model_id} hosted by {provider.provider_type} does not support variable output embedding dimensions."
+        )
 
 
 @pytest.fixture(params=["openai_client", "llama_stack_client"])
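How the new skip helpers are meant to compose, as a hypothetical test following this file's fixture conventions (the test name and inputs are illustrative, not part of this commit):

def test_openai_embeddings_base64_with_dimensions(compat_client, client_with_models, embedding_model_id):
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
    skip_if_model_doesnt_support_encoding_format_base64(client_with_models, embedding_model_id)
    skip_if_model_doesnt_support_variable_dimensions(client_with_models, embedding_model_id)

    response = compat_client.embeddings.create(
        model=embedding_model_id,
        input="Hello, base64 world",
        dimensions=8,
        encoding_format="base64",
    )
    assert isinstance(response.data[0].embedding, str)  # base64 payload, not a list of floats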
@@ -47,7 +78,6 @@ def skip_if_model_doesnt_support_openai_embeddings(client, model_id):
         "inline::meta-reference",
         "remote::bedrock",
         "remote::cerebras",
-        "remote::databricks",
         "remote::runpod",
         "remote::sambanova",
         "remote::tgi",
@@ -92,6 +122,7 @@ def test_openai_embeddings_multiple_strings(compat_client, client_with_models, e
     response = compat_client.embeddings.create(
         model=embedding_model_id,
         input=input_texts,
+        encoding_format="float",
     )
 
     assert response.object == "list"
@@ -127,7 +158,7 @@ def test_openai_embeddings_with_encoding_format_float(compat_client, client_with
 def test_openai_embeddings_with_dimensions(compat_client, client_with_models, embedding_model_id):
     """Test OpenAI embeddings endpoint with custom dimensions parameter."""
     skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
-    skip_if_model_doesnt_support_variable_dimensions(embedding_model_id)
+    skip_if_model_doesnt_support_variable_dimensions(client_with_models, embedding_model_id)
 
     input_text = "Test dimensions parameter"
     dimensions = 16
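What the updated helper now gates, as a minimal check of the dimensions parameter (a sketch assuming the fixtures above, mirroring test_openai_embeddings_with_dimensions):

response = compat_client.embeddings.create(
    model=embedding_model_id,
    input="Test dimensions parameter",
    dimensions=16,
    encoding_format="float",
)
assert len(response.data[0].embedding) == 16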
@@ -148,6 +179,7 @@ def test_openai_embeddings_with_dimensions(compat_client, client_with_models, em
 def test_openai_embeddings_with_user_parameter(compat_client, client_with_models, embedding_model_id):
     """Test OpenAI embeddings endpoint with user parameter."""
     skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
+    skip_if_model_doesnt_support_user_param(client_with_models, embedding_model_id)
 
     input_text = "Test user parameter"
     user_id = "test-user-123"
@@ -196,11 +228,13 @@ def test_openai_embeddings_different_inputs_different_outputs(compat_client, cli
     response1 = compat_client.embeddings.create(
         model=embedding_model_id,
         input=input_text1,
+        encoding_format="float",
     )
 
     response2 = compat_client.embeddings.create(
         model=embedding_model_id,
         input=input_text2,
+        encoding_format="float",
     )
 
     embedding1 = response1.data[0].embedding
@@ -214,7 +248,8 @@ def test_openai_embeddings_different_inputs_different_outputs(compat_client, cli
 def test_openai_embeddings_with_encoding_format_base64(compat_client, client_with_models, embedding_model_id):
     """Test OpenAI embeddings endpoint with base64 encoding format."""
     skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
-    skip_if_model_doesnt_support_variable_dimensions(embedding_model_id)
+    skip_if_model_doesnt_support_encoding_format_base64(client_with_models, embedding_model_id)
+    skip_if_model_doesnt_support_variable_dimensions(client_with_models, embedding_model_id)
 
     input_text = "Test base64 encoding format"
     dimensions = 12
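For context on what the base64 assertions receive: OpenAI-compatible servers return each embedding as base64-encoded little-endian float32 bytes, which decode like this (standalone sketch):

import base64
import struct

def decode_base64_embedding(payload: str) -> list[float]:
    raw = base64.b64decode(payload)
    return list(struct.unpack(f"<{len(raw) // 4}f", raw))  # little-endian float32

# Round-trip check with float32-exact values.
vec = [0.25, -1.0, 3.5]
encoded = base64.b64encode(struct.pack(f"<{len(vec)}f", *vec)).decode("ascii")
assert decode_base64_embedding(encoded) == vec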
@@ -247,6 +282,7 @@ def test_openai_embeddings_base64_batch_processing(compat_client, client_with_mo
 def test_openai_embeddings_base64_batch_processing(compat_client, client_with_models, embedding_model_id):
     """Test OpenAI embeddings endpoint with base64 encoding for batch processing."""
     skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
+    skip_if_model_doesnt_support_encoding_format_base64(client_with_models, embedding_model_id)
 
     input_texts = ["First text for base64", "Second text for base64", "Third text for base64"]
 
@@ -255,7 +291,6 @@ def test_openai_embeddings_base64_batch_processing(compat_client, client_with_mo
         input=input_texts,
         encoding_format="base64",
     )
-
     # Validate response structure
     assert response.object == "list"
     assert response.model == embedding_model_id