chore: create OpenAIMixin for inference providers with an OpenAI-compat API that need to implement openai_* methods (#2835)

# What does this PR do? add an `OpenAIMixin` for use by inference providers whose remote endpoints support an OpenAI-compatible API. use is demonstrated by refactoring - OpenAIInferenceAdapter - NVIDIAInferenceAdapter (adds embedding support) - LlamaCompatInferenceAdapter ## Test Plan existing unit and integration tests
2025-12-03 18:00:36 +00:00 · 2025-07-23 06:49:40 -04:00 · 2025-07-23 06:49:40 -04:00 · e1ed152779
commit e1ed152779
parent fc67ad408a
7 changed files with 402 additions and 387 deletions
--- a/llama_stack/providers/remote/inference/openai/openai.py
+++ b/llama_stack/providers/remote/inference/openai/openai.py
@ -5,23 +5,9 @@
 # the root directory of this source tree.

 import logging
-from collections.abc import AsyncIterator
-from typing import Any

-from openai import AsyncOpenAI, NotFoundError
-
-from llama_stack.apis.inference import (
-    OpenAIChatCompletion,
-    OpenAIChatCompletionChunk,
-    OpenAICompletion,
-    OpenAIEmbeddingData,
-    OpenAIEmbeddingsResponse,
-    OpenAIEmbeddingUsage,
-    OpenAIMessageParam,
-    OpenAIResponseFormatParam,
-)
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
-from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

 from .config import OpenAIConfig
 from .models import MODEL_ENTRIES
@ -30,7 +16,7 @@ logger = logging.getLogger(__name__)


 #
-# This OpenAI adapter implements Inference methods using two clients -
+# This OpenAI adapter implements Inference methods using two mixins -
 #
 # | Inference Method           | Implementation Source    |
 # |----------------------------|--------------------------|
@ -39,11 +25,22 @@ logger = logging.getLogger(__name__)
 # | embedding                  | LiteLLMOpenAIMixin       |
 # | batch_completion           | LiteLLMOpenAIMixin       |
 # | batch_chat_completion      | LiteLLMOpenAIMixin       |
-# | openai_completion          | AsyncOpenAI              |
-# | openai_chat_completion     | AsyncOpenAI              |
-# | openai_embeddings          | AsyncOpenAI              |
+# | openai_completion          | OpenAIMixin              |
+# | openai_chat_completion     | OpenAIMixin              |
+# | openai_embeddings          | OpenAIMixin              |
 #
-class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
+class OpenAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
+    """
+    OpenAI Inference Adapter for Llama Stack.
+
+    Note: The inheritance order is important here. OpenAIMixin must come before
+    LiteLLMOpenAIMixin to ensure that OpenAIMixin.check_model_availability()
+    is used instead of ModelRegistryHelper.check_model_availability().
+
+    - OpenAIMixin.check_model_availability() queries the OpenAI API to check if a model exists
+    - ModelRegistryHelper.check_model_availability() (inherited by LiteLLMOpenAIMixin) just returns False and shows a warning
+    """
+
    def __init__(self, config: OpenAIConfig) -> None:
        LiteLLMOpenAIMixin.__init__(
            self,
@ -60,191 +57,19 @@ class OpenAIInferenceAdapter(LiteLLMOpenAIMixin):
        # litellm specific model names, an abstraction leak.
        self.is_openai_compat = True

-    async def check_model_availability(self, model: str) -> bool:
+    # Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
+    get_api_key = LiteLLMOpenAIMixin.get_api_key
+
+    def get_base_url(self) -> str:
        """
-        Check if a specific model is available from OpenAI.
+        Get the OpenAI API base URL.

-        :param model: The model identifier to check.
-        :return: True if the model is available dynamically, False otherwise.
+        Returns the standard OpenAI API base URL for direct OpenAI API calls.
        """
-        try:
-            openai_client = self._get_openai_client()
-            retrieved_model = await openai_client.models.retrieve(model)
-            logger.info(f"Model {retrieved_model.id} is available from OpenAI")
-            return True
-
-        except NotFoundError:
-            logger.error(f"Model {model} is not available from OpenAI")
-            return False
-
-        except Exception as e:
-            logger.error(f"Failed to check model availability from OpenAI: {e}")
-            return False
+        return "https://api.openai.com/v1"

    async def initialize(self) -> None:
        await super().initialize()

    async def shutdown(self) -> None:
        await super().shutdown()
-
-    def _get_openai_client(self) -> AsyncOpenAI:
-        return AsyncOpenAI(
-            api_key=self.get_api_key(),
-        )
-
-    async def openai_completion(
-        self,
-        model: str,
-        prompt: str | list[str] | list[int] | list[list[int]],
-        best_of: int | None = None,
-        echo: bool | None = None,
-        frequency_penalty: float | None = None,
-        logit_bias: dict[str, float] | None = None,
-        logprobs: bool | None = None,
-        max_tokens: int | None = None,
-        n: int | None = None,
-        presence_penalty: float | None = None,
-        seed: int | None = None,
-        stop: str | list[str] | None = None,
-        stream: bool | None = None,
-        stream_options: dict[str, Any] | None = None,
-        temperature: float | None = None,
-        top_p: float | None = None,
-        user: str | None = None,
-        guided_choice: list[str] | None = None,
-        prompt_logprobs: int | None = None,
-        suffix: str | None = None,
-    ) -> OpenAICompletion:
-        if guided_choice is not None:
-            logging.warning("guided_choice is not supported by the OpenAI API. Ignoring.")
-        if prompt_logprobs is not None:
-            logging.warning("prompt_logprobs is not supported by the OpenAI API. Ignoring.")
-
-        model_id = (await self.model_store.get_model(model)).provider_resource_id
-        if model_id.startswith("openai/"):
-            model_id = model_id[len("openai/") :]
-        params = await prepare_openai_completion_params(
-            model=model_id,
-            prompt=prompt,
-            best_of=best_of,
-            echo=echo,
-            frequency_penalty=frequency_penalty,
-            logit_bias=logit_bias,
-            logprobs=logprobs,
-            max_tokens=max_tokens,
-            n=n,
-            presence_penalty=presence_penalty,
-            seed=seed,
-            stop=stop,
-            stream=stream,
-            stream_options=stream_options,
-            temperature=temperature,
-            top_p=top_p,
-            user=user,
-            suffix=suffix,
-        )
-        return await self._get_openai_client().completions.create(**params)
-
-    async def openai_chat_completion(
-        self,
-        model: str,
-        messages: list[OpenAIMessageParam],
-        frequency_penalty: float | None = None,
-        function_call: str | dict[str, Any] | None = None,
-        functions: list[dict[str, Any]] | None = None,
-        logit_bias: dict[str, float] | None = None,
-        logprobs: bool | None = None,
-        max_completion_tokens: int | None = None,
-        max_tokens: int | None = None,
-        n: int | None = None,
-        parallel_tool_calls: bool | None = None,
-        presence_penalty: float | None = None,
-        response_format: OpenAIResponseFormatParam | None = None,
-        seed: int | None = None,
-        stop: str | list[str] | None = None,
-        stream: bool | None = None,
-        stream_options: dict[str, Any] | None = None,
-        temperature: float | None = None,
-        tool_choice: str | dict[str, Any] | None = None,
-        tools: list[dict[str, Any]] | None = None,
-        top_logprobs: int | None = None,
-        top_p: float | None = None,
-        user: str | None = None,
-    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        model_id = (await self.model_store.get_model(model)).provider_resource_id
-        if model_id.startswith("openai/"):
-            model_id = model_id[len("openai/") :]
-        params = await prepare_openai_completion_params(
-            model=model_id,
-            messages=messages,
-            frequency_penalty=frequency_penalty,
-            function_call=function_call,
-            functions=functions,
-            logit_bias=logit_bias,
-            logprobs=logprobs,
-            max_completion_tokens=max_completion_tokens,
-            max_tokens=max_tokens,
-            n=n,
-            parallel_tool_calls=parallel_tool_calls,
-            presence_penalty=presence_penalty,
-            response_format=response_format,
-            seed=seed,
-            stop=stop,
-            stream=stream,
-            stream_options=stream_options,
-            temperature=temperature,
-            tool_choice=tool_choice,
-            tools=tools,
-            top_logprobs=top_logprobs,
-            top_p=top_p,
-            user=user,
-        )
-        return await self._get_openai_client().chat.completions.create(**params)
-
-    async def openai_embeddings(
-        self,
-        model: str,
-        input: str | list[str],
-        encoding_format: str | None = "float",
-        dimensions: int | None = None,
-        user: str | None = None,
-    ) -> OpenAIEmbeddingsResponse:
-        model_id = (await self.model_store.get_model(model)).provider_resource_id
-        if model_id.startswith("openai/"):
-            model_id = model_id[len("openai/") :]
-
-        # Prepare parameters for OpenAI embeddings API
-        params = {
-            "model": model_id,
-            "input": input,
-        }
-
-        if encoding_format is not None:
-            params["encoding_format"] = encoding_format
-        if dimensions is not None:
-            params["dimensions"] = dimensions
-        if user is not None:
-            params["user"] = user
-
-        # Call OpenAI embeddings API
-        response = await self._get_openai_client().embeddings.create(**params)
-
-        data = []
-        for i, embedding_data in enumerate(response.data):
-            data.append(
-                OpenAIEmbeddingData(
-                    embedding=embedding_data.embedding,
-                    index=i,
-                )
-            )
-
-        usage = OpenAIEmbeddingUsage(
-            prompt_tokens=response.usage.prompt_tokens,
-            total_tokens=response.usage.total_tokens,
-        )
-
-        return OpenAIEmbeddingsResponse(
-            data=data,
-            model=response.model,
-            usage=usage,
-        )