feat: add Azure OpenAI inference provider support

Llama-stack now supports a new OpenAI compatible endpoint with Azure OpenAI. The starter distro has been updated to add the new remote inference provider. A few tests have been modified and improved. Tests plan: Deploy a model in the Aure portal then: ``` $ AZURE_API_KEY=... AZURE_API_BASE=... uv run llama stack build --image-type venv --providers inference=remote::azure --run ... $ LLAMA_STACK_CONFIG=http://localhost:8321 uv run --group test pytest -v -ra --text-model azure/gpt-4.1 tests/integration/inference/test_openai_completion.py ... Results: ``` ============================================= test session starts ============================================== platform darwin -- Python 3.12.8, pytest-8.4.1, pluggy-1.6.0 -- /Users/leseb/Documents/AI/llama-stack/.venv/bin/python3 cachedir: .pytest_cache metadata: {'Python': '3.12.8', 'Platform': 'macOS-15.6.1-arm64-arm-64bit', 'Packages': {'pytest': '8.4.1', 'pluggy': '1.6.0'}, 'Plugins': {'anyio': '4.9.0', 'html': '4.1.1', 'socket': '0.7.0', 'asyncio': '1.1.0', 'json-report': '1.5.0', 'timeout': '2.4.0', 'metadata': '3.1.1', 'cov': '6.2.1', 'nbval': '0.11.0', 'hydra-core': '1.3.2'}} rootdir: /Users/leseb/Documents/AI/llama-stack configfile: pyproject.toml plugins: anyio-4.9.0, html-4.1.1, socket-0.7.0, asyncio-1.1.0, json-report-1.5.0, timeout-2.4.0, metadata-3.1.1, cov-6.2.1, nbval-0.11.0, hydra-core-1.3.2 asyncio: mode=Mode.AUTO, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function collected 27 items tests/integration/inference/test_openai_completion.py::test_openai_completion_non_streaming[txt=azure/gpt-5-mini-inference:completion:sanity] SKIPPED [ 3%] tests/integration/inference/test_openai_completion.py::test_openai_completion_non_streaming_suffix[txt=azure/gpt-5-mini-inference:completion:suffix] SKIPPED [ 7%] tests/integration/inference/test_openai_completion.py::test_openai_completion_streaming[txt=azure/gpt-5-mini-inference:completion:sanity] SKIPPED [ 11%] tests/integration/inference/test_openai_completion.py::test_openai_completion_prompt_logprobs[txt=azure/gpt-5-mini-1] SKIPPED [ 14%] tests/integration/inference/test_openai_completion.py::test_openai_completion_guided_choice[txt=azure/gpt-5-mini] SKIPPED [ 18%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_01] PASSED [ 22%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 25%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 29%] tests/integration/inference/test_openai_completion.py::test_inference_store[openai_client-txt=azure/gpt-5-mini-True] PASSED [ 33%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=azure/gpt-5-mini-True] PASSED [ 37%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming_with_file[txt=azure/gpt-5-mini] SKIPPEDed files.) [ 40%] tests/integration/inference/test_openai_completion.py::test_openai_completion_prompt_logprobs[txt=azure/gpt-5-mini-0] SKIPPED [ 44%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_02] PASSED [ 48%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 51%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 55%] tests/integration/inference/test_openai_completion.py::test_inference_store[openai_client-txt=azure/gpt-5-mini-False] PASSED [ 59%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=azure/gpt-5-mini-False] PASSED [ 62%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_01] PASSED [ 66%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 70%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 74%] tests/integration/inference/test_openai_completion.py::test_inference_store[client_with_models-txt=azure/gpt-5-mini-True] PASSED [ 77%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=azure/gpt-5-mini-True] PASSED [ 81%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_02] PASSED [ 85%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 88%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 92%] tests/integration/inference/test_openai_completion.py::test_inference_store[client_with_models-txt=azure/gpt-5-mini-False] PASSED [ 96%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=azure/gpt-5-mini-False] PASSED [100%] =========================================== short test summary info ============================================ SKIPPED [3] tests/integration/inference/test_openai_completion.py:63: Model azure/gpt-5-mini hosted by remote::azure doesn't support OpenAI completions. SKIPPED [3] tests/integration/inference/test_openai_completion.py:118: Model azure/gpt-5-mini hosted by remote::azure doesn't support vllm extra_body parameters. SKIPPED [1] tests/integration/inference/test_openai_completion.py:124: Model azure/gpt-5-mini hosted by remote::azure doesn't support chat completion calls with base64 encoded files. ================================== 20 passed, 7 skipped, 2 warnings in 51.77s ================================== ``` Signed-off-by: Sébastien Han <seb@redhat.com>
2025-10-04 12:07:34 +00:00 · 2025-09-01 16:41:30 +02:00 · 2025-09-01 16:41:30 +02:00 · ee3df99de4
commit ee3df99de4
parent 8e05c68d15
26 changed files with 6403 additions and 13 deletions
--- a/tests/integration/inference/test_openai_completion.py
+++ b/tests/integration/inference/test_openai_completion.py
@ -6,12 +6,25 @@


 import time
+import unicodedata

 import pytest

 from ..test_cases.test_case import TestCase


+def _normalize_text(text: str) -> str:
+    """
+    Normalize Unicode text by removing diacritical marks for comparison.
+
+    The test case streaming_01 expects the answer "Sol" for the question "What's the name of the Sun
+    in latin?", but the model is returning "sōl" (with a macron over the 'o'), which is the correct
+    Latin spelling. The test is failing because it's doing a simple case-insensitive string search
+    for "sol" but the actual response contains the diacritical mark.
+    """
+    return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("ascii").lower()
+
+
 def provider_from_model(client_with_models, model_id):
    models = {m.identifier: m for m in client_with_models.models.list()}
    models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
@ -42,6 +55,10 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
        "remote::groq",
        "remote::gemini",  # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404
        "remote::anthropic",  # at least claude-3-{5,7}-{haiku,sonnet}-* / claude-{sonnet,opus}-4-* are not supported
+        "remote::azure",  # {'error': {'code': 'OperationNotSupported', 'message': 'The completion operation
+        #  does not work with the specified model, gpt-5-mini. Please choose different model and try
+        #  again. You can learn more about which models can be used with each operation here:
+        #  https://go.microsoft.com/fwlink/?linkid=2197993.'}}"}
    ):
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")

@ -157,7 +174,8 @@ def test_openai_completion_non_streaming_suffix(llama_stack_client, client_with_
    assert len(response.choices) > 0
    choice = response.choices[0]
    assert len(choice.text) > 5
-    assert "france" in choice.text.lower()
+    normalized_text = _normalize_text(choice.text)
+    assert "france" in normalized_text


@pytest.mark.parametrize(
@ -248,7 +266,9 @@ def test_openai_chat_completion_non_streaming(compat_client, client_with_models,
    )
    message_content = response.choices[0].message.content.lower().strip()
    assert len(message_content) > 0
-    assert expected.lower() in message_content
+    normalized_expected = _normalize_text(expected)
+    normalized_content = _normalize_text(message_content)
+    assert normalized_expected in normalized_content


@pytest.mark.parametrize(
@ -272,10 +292,13 @@ def test_openai_chat_completion_streaming(compat_client, client_with_models, tex
    )
    streamed_content = []
    for chunk in response:
-        if chunk.choices[0].delta.content:
+        # On some providers like Azure, the choices are empty on the first chunk, so we need to check for that
+        if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content:
            streamed_content.append(chunk.choices[0].delta.content.lower().strip())
    assert len(streamed_content) > 0
-    assert expected.lower() in "".join(streamed_content)
+    normalized_expected = _normalize_text(expected)
+    normalized_content = _normalize_text("".join(streamed_content))
+    assert normalized_expected in normalized_content


@pytest.mark.parametrize(
@ -308,8 +331,12 @@ def test_openai_chat_completion_streaming_with_n(compat_client, client_with_mode
                    streamed_content.get(choice.index, "") + choice.delta.content.lower().strip()
                )
    assert len(streamed_content) == 2
+    normalized_expected = _normalize_text(expected)
    for i, content in streamed_content.items():
-        assert expected.lower() in content, f"Choice {i}: Expected {expected.lower()} in {content}"
+        normalized_content = _normalize_text(content)
+        assert normalized_expected in normalized_content, (
+            f"Choice {i}: Expected {normalized_expected} in {normalized_content}"
+        )


@pytest.mark.parametrize(
@ -339,9 +366,9 @@ def test_inference_store(compat_client, client_with_models, text_model_id, strea
        content = ""
        response_id = None
        for chunk in response:
-            if response_id is None:
+            if response_id is None and chunk.id:
                response_id = chunk.id
-            if chunk.choices[0].delta.content:
+            if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content:
                content += chunk.choices[0].delta.content
    else:
        response_id = response.id
@ -410,11 +437,12 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode
        content = ""
        response_id = None
        for chunk in response:
-            if response_id is None:
+            if response_id is None and chunk.id:
                response_id = chunk.id
-            if delta := chunk.choices[0].delta:
-                if delta.content:
-                    content += delta.content
+            if chunk.choices and len(chunk.choices) > 0:
+                if delta := chunk.choices[0].delta:
+                    if delta.content:
+                        content += delta.content
    else:
        response_id = response.id
        content = response.choices[0].message.content
@ -484,4 +512,5 @@ def test_openai_chat_completion_non_streaming_with_file(openai_client, client_wi
        stream=False,
    )
    message_content = response.choices[0].message.content.lower().strip()
-    assert "hello world" in message_content
+    normalized_content = _normalize_text(message_content)
+    assert "hello world" in normalized_content