feat: add Azure OpenAI inference provider support (#3396)

# What does this PR do?

Llama Stack now supports Azure OpenAI as a new OpenAI-compatible remote
inference provider. The starter distro has been updated to include the
new provider.

A few tests have been modified and improved.

## Test Plan

Deploy a model in the Azure portal, then:

```
$ AZURE_API_KEY=... AZURE_API_BASE=... uv run llama stack build --image-type venv --providers inference=remote::azure --run
...
$ LLAMA_STACK_CONFIG=http://localhost:8321 uv run --group test pytest -v -ra --text-model azure/gpt-4.1 tests/integration/inference/test_openai_completion.py
...
```

Results:

```
============================================= test session starts
============================================== platform darwin -- Python
3.12.8, pytest-8.4.1, pluggy-1.6.0 --
/Users/leseb/Documents/AI/llama-stack/.venv/bin/python3 cachedir:
.pytest_cache
metadata: {'Python': '3.12.8', 'Platform':
'macOS-15.6.1-arm64-arm-64bit', 'Packages': {'pytest': '8.4.1',
'pluggy': '1.6.0'}, 'Plugins': {'anyio': '4.9.0', 'html': '4.1.1',
'socket': '0.7.0', 'asyncio': '1.1.0', 'json-report': '1.5.0',
'timeout': '2.4.0', 'metadata': '3.1.1', 'cov': '6.2.1', 'nbval':
'0.11.0', 'hydra-core': '1.3.2'}} rootdir:
/Users/leseb/Documents/AI/llama-stack
configfile: pyproject.toml
plugins: anyio-4.9.0, html-4.1.1, socket-0.7.0, asyncio-1.1.0,
json-report-1.5.0, timeout-2.4.0, metadata-3.1.1, cov-6.2.1,
nbval-0.11.0, hydra-core-1.3.2 asyncio: mode=Mode.AUTO,
asyncio_default_fixture_loop_scope=None,
asyncio_default_test_loop_scope=function collected 27 items


tests/integration/inference/test_openai_completion.py::test_openai_completion_non_streaming[txt=azure/gpt-5-mini-inference:completion:sanity]
SKIPPED [ 3%]
tests/integration/inference/test_openai_completion.py::test_openai_completion_non_streaming_suffix[txt=azure/gpt-5-mini-inference:completion:suffix]
SKIPPED [ 7%]
tests/integration/inference/test_openai_completion.py::test_openai_completion_streaming[txt=azure/gpt-5-mini-inference:completion:sanity]
SKIPPED [ 11%]
tests/integration/inference/test_openai_completion.py::test_openai_completion_prompt_logprobs[txt=azure/gpt-5-mini-1]
SKIPPED [ 14%]
tests/integration/inference/test_openai_completion.py::test_openai_completion_guided_choice[txt=azure/gpt-5-mini]
SKIPPED [ 18%]
tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_01]
PASSED [ 22%]
tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01]
PASSED [ 25%]
tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01]
PASSED [ 29%]
tests/integration/inference/test_openai_completion.py::test_inference_store[openai_client-txt=azure/gpt-5-mini-True]
PASSED [ 33%]
tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=azure/gpt-5-mini-True]
PASSED [ 37%]
tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming_with_file[txt=azure/gpt-5-mini]
SKIPPED [ 40%]
tests/integration/inference/test_openai_completion.py::test_openai_completion_prompt_logprobs[txt=azure/gpt-5-mini-0]
SKIPPED [ 44%]
tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_02]
PASSED [ 48%]
tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02]
PASSED [ 51%]
tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02]
PASSED [ 55%]
tests/integration/inference/test_openai_completion.py::test_inference_store[openai_client-txt=azure/gpt-5-mini-False]
PASSED [ 59%]
tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=azure/gpt-5-mini-False]
PASSED [ 62%]
tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_01]
PASSED [ 66%]
tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01]
PASSED [ 70%]
tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01]
PASSED [ 74%]
tests/integration/inference/test_openai_completion.py::test_inference_store[client_with_models-txt=azure/gpt-5-mini-True]
PASSED [ 77%]
tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=azure/gpt-5-mini-True]
PASSED [ 81%]
tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_02]
PASSED [ 85%]
tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02]
PASSED [ 88%]
tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02]
PASSED [ 92%]
tests/integration/inference/test_openai_completion.py::test_inference_store[client_with_models-txt=azure/gpt-5-mini-False]
PASSED [ 96%]
tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=azure/gpt-5-mini-False]
PASSED [100%]

=========================================== short test summary info
============================================ SKIPPED [3]
tests/integration/inference/test_openai_completion.py:63: Model
azure/gpt-5-mini hosted by remote::azure doesn't support OpenAI
completions. SKIPPED [3]
tests/integration/inference/test_openai_completion.py:118: Model
azure/gpt-5-mini hosted by remote::azure doesn't support vllm extra_body
parameters. SKIPPED [1]
tests/integration/inference/test_openai_completion.py:124: Model
azure/gpt-5-mini hosted by remote::azure doesn't support chat completion
calls with base64 encoded files. ================================== 20
passed, 7 skipped, 2 warnings in 51.77s
==================================
```

Signed-off-by: Sébastien Han <seb@redhat.com>
This commit is contained in:
Sébastien Han 2025-09-11 13:48:38 +02:00 committed by GitHub
parent c2d281e01b
commit f31bcc11bc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
26 changed files with 6403 additions and 13 deletions

View file

@ -6,12 +6,25 @@
import time
import unicodedata
import pytest
from ..test_cases.test_case import TestCase
def _normalize_text(text: str) -> str:
"""
Normalize Unicode text by removing diacritical marks for comparison.
The test case streaming_01 expects the answer "Sol" for the question "What's the name of the Sun
in latin?", but the model is returning "sōl" (with a macron over the 'o'), which is the correct
Latin spelling. The test is failing because it's doing a simple case-insensitive string search
for "sol" but the actual response contains the diacritical mark.
"""
return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("ascii").lower()
def provider_from_model(client_with_models, model_id):
models = {m.identifier: m for m in client_with_models.models.list()}
models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
@ -42,6 +55,10 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
"remote::groq",
"remote::gemini", # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404
"remote::anthropic", # at least claude-3-{5,7}-{haiku,sonnet}-* / claude-{sonnet,opus}-4-* are not supported
"remote::azure", # {'error': {'code': 'OperationNotSupported', 'message': 'The completion operation
# does not work with the specified model, gpt-5-mini. Please choose different model and try
# again. You can learn more about which models can be used with each operation here:
# https://go.microsoft.com/fwlink/?linkid=2197993.'}}"}
):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
@ -157,7 +174,8 @@ def test_openai_completion_non_streaming_suffix(llama_stack_client, client_with_
assert len(response.choices) > 0
choice = response.choices[0]
assert len(choice.text) > 5
assert "france" in choice.text.lower()
normalized_text = _normalize_text(choice.text)
assert "france" in normalized_text
@pytest.mark.parametrize(
@ -248,7 +266,9 @@ def test_openai_chat_completion_non_streaming(compat_client, client_with_models,
)
message_content = response.choices[0].message.content.lower().strip()
assert len(message_content) > 0
assert expected.lower() in message_content
normalized_expected = _normalize_text(expected)
normalized_content = _normalize_text(message_content)
assert normalized_expected in normalized_content
@pytest.mark.parametrize(
@ -272,10 +292,13 @@ def test_openai_chat_completion_streaming(compat_client, client_with_models, tex
)
streamed_content = []
for chunk in response:
if chunk.choices[0].delta.content:
# On some providers like Azure, the choices are empty on the first chunk, so we need to check for that
if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content:
streamed_content.append(chunk.choices[0].delta.content.lower().strip())
assert len(streamed_content) > 0
assert expected.lower() in "".join(streamed_content)
normalized_expected = _normalize_text(expected)
normalized_content = _normalize_text("".join(streamed_content))
assert normalized_expected in normalized_content
@pytest.mark.parametrize(
@ -308,8 +331,12 @@ def test_openai_chat_completion_streaming_with_n(compat_client, client_with_mode
streamed_content.get(choice.index, "") + choice.delta.content.lower().strip()
)
assert len(streamed_content) == 2
normalized_expected = _normalize_text(expected)
for i, content in streamed_content.items():
assert expected.lower() in content, f"Choice {i}: Expected {expected.lower()} in {content}"
normalized_content = _normalize_text(content)
assert normalized_expected in normalized_content, (
f"Choice {i}: Expected {normalized_expected} in {normalized_content}"
)
@pytest.mark.parametrize(
@ -339,9 +366,9 @@ def test_inference_store(compat_client, client_with_models, text_model_id, strea
content = ""
response_id = None
for chunk in response:
if response_id is None:
if response_id is None and chunk.id:
response_id = chunk.id
if chunk.choices[0].delta.content:
if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content:
content += chunk.choices[0].delta.content
else:
response_id = response.id
@ -410,11 +437,12 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode
content = ""
response_id = None
for chunk in response:
if response_id is None:
if response_id is None and chunk.id:
response_id = chunk.id
if delta := chunk.choices[0].delta:
if delta.content:
content += delta.content
if chunk.choices and len(chunk.choices) > 0:
if delta := chunk.choices[0].delta:
if delta.content:
content += delta.content
else:
response_id = response.id
content = response.choices[0].message.content
@ -484,4 +512,5 @@ def test_openai_chat_completion_non_streaming_with_file(openai_client, client_wi
stream=False,
)
message_content = response.choices[0].message.content.lower().strip()
assert "hello world" in message_content
normalized_content = _normalize_text(message_content)
assert "hello world" in normalized_content

View file

@ -32,6 +32,7 @@ def skip_if_model_doesnt_support_completion(client_with_models, model_id):
"remote::vertexai",
"remote::groq",
"remote::sambanova",
"remote::azure",
)
or "openai-compat" in provider.provider_type
):
@ -44,7 +45,7 @@ def skip_if_model_doesnt_support_json_schema_structured_output(client_with_model
provider_id = models[model_id].provider_id
providers = {p.provider_id: p for p in client_with_models.providers.list()}
provider = providers[provider_id]
if provider.provider_type in ("remote::sambanova",):
if provider.provider_type in ("remote::sambanova", "remote::azure"):
pytest.skip(
f"Model {model_id} hosted by {provider.provider_type} doesn't support json_schema structured output"
)