feat: azure ai inference support

Facundo Santiago 2024-11-04 06:41:15 +00:00
parent bf4f97a2e1
commit 27a0545f5f
5 changed files with 317 additions and 0 deletions


@@ -0,0 +1,17 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .azure_ai_inference import AzureAIInferenceAdapter
from .config import AzureAIInferenceConfig


async def get_adapter_impl(config: AzureAIInferenceConfig, _deps):
    assert isinstance(config, AzureAIInferenceConfig), f"Unexpected config type: {type(config)}"

    impl = AzureAIInferenceAdapter(config)
    await impl.initialize()
    return impl
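For orientation, a minimal sketch of how this entry point could be exercised. The endpoint URL and key are placeholders (not part of this commit), and `initialize()` will only succeed against a live Azure AI endpoint.

import asyncio

from llama_stack.providers.adapters.inference.azure_ai_inference import (
    AzureAIInferenceConfig,
    get_adapter_impl,
)


async def main():
    # hypothetical endpoint and key, shown only to illustrate the call shape
    config = AzureAIInferenceConfig(
        endpoint="https://my-endpoint.eastus2.models.ai.azure.com",
        credential="<api-key>",
    )
    adapter = await get_adapter_impl(config, {})  # second argument is the unused _deps
    print(await adapter.list_models())


asyncio.run(main())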


@@ -0,0 +1,259 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import logging
from typing import AsyncGenerator

from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.datatypes import Message, StopReason
from llama_models.llama3.api.tokenizer import Tokenizer
from llama_models.sku_list import resolve_model

from azure.ai.inference.aio import ChatCompletionsClient as ChatCompletionsClientAsync
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError

# the async client requires the async credential variant from azure.identity.aio
from azure.identity.aio import DefaultAzureCredential

from llama_stack.apis.inference import *  # noqa: F403
from llama_stack.providers.datatypes import ModelsProtocolPrivate
from llama_stack.providers.utils.inference.openai_compat import (
    process_chat_completion_response,
    process_chat_completion_stream_response,
)
from llama_stack.providers.utils.inference.prompt_adapter import (
    chat_completion_request_to_messages,
)

from .config import AzureAIInferenceConfig

# Mapping from Llama model names to the Azure AI model catalog names
SUPPORTED_INSTRUCT_MODELS = {
    "Llama3.1-8B-Instruct": "Meta-Llama-3.1-8B-Instruct",
    "Llama3.1-70B-Instruct": "Meta-Llama-3.1-70B-Instruct",
    "Llama3.1-405B-Instruct": "Meta-Llama-3.1-405B-Instruct",
    "Llama3.2-1B-Instruct": "Llama-3.2-1B-Instruct",
    "Llama3.2-3B-Instruct": "Llama-3.2-3B-Instruct",
    "Llama3.2-11B-Vision-Instruct": "Llama-3.2-11B-Vision-Instruct",
    "Llama3.2-90B-Vision-Instruct": "Llama-3.2-90B-Vision-Instruct",
}

logger = logging.getLogger(__name__)


class AzureAIInferenceAdapter(Inference, ModelsProtocolPrivate):
    def __init__(self, config: AzureAIInferenceConfig) -> None:
        tokenizer = Tokenizer.get_instance()
        self.config = config
        self.formatter = ChatFormat(tokenizer)
        self._model_name = None

    @property
    def client(self) -> ChatCompletionsClientAsync:
        if self.config.credential is None:
            credential = DefaultAzureCredential()
        else:
            credential = AzureKeyCredential(self.config.credential)

        if self.config.api_version:
            return ChatCompletionsClientAsync(
                endpoint=self.config.endpoint,
                credential=credential,
                user_agent="llama-stack",
                api_version=self.config.api_version,
            )
        else:
            return ChatCompletionsClientAsync(
                endpoint=self.config.endpoint,
                credential=credential,
                user_agent="llama-stack",
            )

    async def initialize(self) -> None:
        async with self.client as async_client:
            try:
                model_info = await async_client.get_model_info()
                if model_info:
                    self._model_name = model_info.get("model_name", None)
                    logger.info(
                        f"Endpoint {self.config.endpoint} supports model {self._model_name}"
                    )
                    if self._model_name not in SUPPORTED_INSTRUCT_MODELS.values():
                        logger.warning(
                            f"Endpoint serves model {self._model_name} which may not be supported"
                        )
            except HttpResponseError:
                logger.info(
                    f"Endpoint {self.config.endpoint} supports multiple models"
                )
                self._model_name = None

    async def shutdown(self) -> None:
        pass

    async def list_models(self) -> List[ModelDef]:
        logger.debug("Model name: %s", self._model_name)
        if self._model_name is None:
            return [
                ModelDef(identifier=model_name, llama_model=azure_model_id)
                for model_name, azure_model_id in SUPPORTED_INSTRUCT_MODELS.items()
            ]
        else:
            # find the Llama identifier whose Azure catalog name matches the resolved model name
            supported_model = next(
                (
                    model
                    for model, azure_model_id in SUPPORTED_INSTRUCT_MODELS.items()
                    if azure_model_id == self._model_name
                ),
                None,
            )
            return [
                ModelDef(
                    identifier=supported_model or self._model_name,
                    llama_model=self._model_name,
                )
            ]

    async def completion(
        self,
        model: str,
        content: InterleavedTextMedia,
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        response_format: Optional[ResponseFormat] = None,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> AsyncGenerator:
        raise NotImplementedError()

    async def chat_completion(
        self,
        model: str,
        messages: List[Message],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        response_format: Optional[ResponseFormat] = None,
        tools: Optional[List[ToolDefinition]] = None,
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> AsyncGenerator:
        request = ChatCompletionRequest(
            # the config carries no default model name, so the caller's model is used as-is
            model=model,
            messages=messages,
            sampling_params=sampling_params,
            tools=tools or [],
            tool_choice=tool_choice,
            tool_prompt_format=tool_prompt_format,
            stream=stream,
            logprobs=logprobs,
        )

        params = self._get_params(request)

        if stream:
            return self._stream_chat_completion(params)
        else:
            return await self._nonstream_chat_completion(params)

    async def _nonstream_chat_completion(
        self, params: dict
    ) -> ChatCompletionResponse:
        async with self.client as client:
            r = await client.complete(**params)
            return process_chat_completion_response(r, self.formatter)

    async def _stream_chat_completion(
        self, params: dict
    ) -> AsyncGenerator:
        async with self.client as client:
            stream = await client.complete(**params, stream=True)
            async for chunk in process_chat_completion_stream_response(
                stream, self.formatter
            ):
                yield chunk

    @staticmethod
    def _get_sampling_options(
        params: SamplingParams,
        logprobs: Optional[LogProbConfig] = None,
    ) -> dict:
        options = {}
        model_extras = {}
        if params:
            # repetition_penalty is not supported by the Azure AI inference API
            for attr in {"temperature", "top_p", "max_tokens"}:
                if getattr(params, attr):
                    options[attr] = getattr(params, attr)
            if params.top_k is not None and params.top_k != 0:
                model_extras["top_k"] = params.top_k
        if logprobs is not None:
            # SamplingParams has no logprobs field; forward the requested top_k from LogProbConfig
            model_extras["logprobs"] = logprobs.top_k
        if model_extras:
            options["model_extras"] = model_extras
        return options

    @staticmethod
    def _to_azure_ai_messages(messages: List[Message]) -> List[dict]:
        """
        Convert the messages to the format expected by the Azure AI API.
        """
        azure_ai_messages = []
        for message in messages:
            role = message.role
            content = message.content

            if role == "user":
                azure_ai_messages.append({"role": role, "content": content})
            elif role == "assistant":
                azure_ai_messages.append(
                    {"role": role, "content": content, "tool_calls": message.tool_calls}
                )
            elif role == "system":
                azure_ai_messages.append({"role": role, "content": content})
            elif role == "ipython":
                azure_ai_messages.append(
                    {
                        "role": "tool",
                        "content": content,
                        "tool_call_id": message.call_id,
                    }
                )

        return azure_ai_messages

    def _get_params(self, request: ChatCompletionRequest) -> dict:
        """
        Get the parameters for the Azure AI model inference API from the chat completion request.
        Parameters are returned as a dictionary.
        """
        options = self._get_sampling_options(request.sampling_params, request.logprobs)
        messages = self._to_azure_ai_messages(chat_completion_request_to_messages(request))

        if self._model_name:
            # If the model name is already resolved, the endpoint is serving a
            # single model and we don't need to specify it in the request
            return {
                "messages": messages,
                **options,
            }
        else:
            return {
                "messages": messages,
                "model": SUPPORTED_INSTRUCT_MODELS.get(request.model, request.model),
                **options,
            }

    async def embeddings(
        self,
        model: str,
        contents: List[InterleavedTextMedia],
    ) -> EmbeddingsResponse:
        raise NotImplementedError()
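A rough usage sketch for the adapter above, assuming an already-initialized AzureAIInferenceAdapter. The model identifier, prompt, and sampling values are illustrative only, and SamplingParams/UserMessage are assumed to be exposed by the star-imported llama_stack inference API.

from llama_stack.apis.inference import SamplingParams, UserMessage


async def ask(adapter: AzureAIInferenceAdapter) -> None:
    # non-streaming call; with stream=True the adapter returns an async generator of chunks instead
    response = await adapter.chat_completion(
        model="Llama3.1-8B-Instruct",  # Llama name, mapped to the Azure catalog name in _get_params
        messages=[UserMessage(content="Write a haiku about the ocean.")],
        sampling_params=SamplingParams(temperature=0.7, max_tokens=128),
        stream=False,
    )
    print(response.completion_message.content)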


@@ -0,0 +1,26 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import *  # noqa: F403

from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field


@json_schema_type
class AzureAIInferenceConfig(BaseModel):
    endpoint: str = Field(
        default=None,
        description="The endpoint URL where the model(s) is/are deployed.",
    )
    credential: Optional[str] = Field(
        default=None,
        description="The secret to access the model. If None, then `DefaultAzureCredential` is attempted.",
    )
    api_version: Optional[str] = Field(
        default=None,
        description="The API version to use with the endpoint. If None, the default version from the "
        "`azure-ai-inference` package is used, which can be overridden with the AZURE_AI_API_VERSION "
        "environment variable.",
    )
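For clarity, a short sketch of the two authentication modes this config implies; the endpoint and API version values are placeholders, not part of this commit.

# Key-based auth: the adapter wraps the secret in AzureKeyCredential
key_config = AzureAIInferenceConfig(
    endpoint="https://my-endpoint.eastus2.models.ai.azure.com",
    credential="<api-key>",
)

# Microsoft Entra ID auth: leaving credential unset makes the adapter fall back
# to DefaultAzureCredential
aad_config = AzureAIInferenceConfig(
    endpoint="https://my-endpoint.eastus2.models.ai.azure.com",
    api_version="2024-05-01-preview",  # illustrative API version string
)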


@@ -140,6 +140,15 @@ def available_providers() -> List[ProviderSpec]:
                config_class="llama_stack.providers.adapters.inference.databricks.DatabricksImplConfig",
            ),
        ),
        remote_provider_spec(
            api=Api.inference,
            adapter=AdapterSpec(
                adapter_type="azure-ai-inference",
                pip_packages=["azure-ai-inference", "azure-identity", "aiohttp"],
                module="llama_stack.providers.adapters.inference.azure_ai_inference",
                config_class="llama_stack.providers.adapters.inference.azure_ai_inference.AzureAIInferenceConfig",
            ),
        ),
        InlineProviderSpec(
            api=Api.inference,
            provider_type="vllm",


@@ -45,6 +45,9 @@ def get_sampling_options(params: SamplingParams) -> dict:
def text_from_choice(choice) -> str:
    if hasattr(choice, "delta") and choice.delta:
        return choice.delta.content

    if hasattr(choice, "message"):
        return choice.message.content

    return choice.text
@@ -158,6 +161,9 @@ async def process_chat_completion_stream_response(
            break

        text = text_from_choice(choice)
        if not text:
            continue

        # check if its a tool call ( aka starts with <|python_tag|> )
        if not ipython and text.startswith("<|python_tag|>"):
            ipython = True
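To illustrate why the `message` branch was added to text_from_choice: a small sketch with stand-in choice objects covering the three shapes the helper now handles (streaming delta, non-streaming message, and plain text).

from types import SimpleNamespace

from llama_stack.providers.utils.inference.openai_compat import text_from_choice

# streaming chunk: content arrives in choice.delta
delta_choice = SimpleNamespace(delta=SimpleNamespace(content="Hello"))

# non-streaming response (e.g. Azure AI inference): content lives in choice.message
message_choice = SimpleNamespace(delta=None, message=SimpleNamespace(content="Hello"))

# plain completion: falls through to choice.text
text_choice = SimpleNamespace(text="Hello")

assert text_from_choice(delta_choice) == "Hello"
assert text_from_choice(message_choice) == "Hello"
assert text_from_choice(text_choice) == "Hello"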