OpenAI completion prompt can also be an array

The OpenAI completion prompt field can be a string or an array of
strings, so update the API spec and the inference providers to accept
and pass either form through properly.
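
For example, with the OpenAI Python client pointed at a Llama Stack
server, both of the following now work (the base URL, API key, and
model name are placeholders, not values from this change):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# a single prompt string
client.completions.create(model="example-model", prompt="Say hello")

# a batched prompt: a list of strings, yielding one choice per prompt
client.completions.create(model="example-model", prompt=["Say hello", "Say goodbye"])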

This also stubs in a basic conversion of OpenAI non-streaming
completion requests to Llama Stack completion calls, so that providers
without an actual OpenAI backend can still accept requests via the
OpenAI APIs.
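
As a rough sketch of how a provider without an OpenAI backend picks
this up (the adapter class is illustrative and the import paths are
assumed, since file names are not shown here):

from llama_stack.apis.inference import Inference
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAICompletionUnsupportedMixin,
)


class ExampleNativeOnlyAdapter(OpenAICompletionUnsupportedMixin, Inference):
    # The adapter keeps its native Llama Stack `completion` implementation;
    # the mixin's `openai_completion` (added below) converts non-streaming
    # OpenAI completion requests into calls to it.
    ...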

Signed-off-by: Ben Browning <bbrownin@redhat.com>
Ben Browning 2025-04-09 09:28:50 -04:00
parent 24cfa1ef1a
commit a6cf8fa12b
10 changed files with 95 additions and 12 deletions


@@ -9401,7 +9401,17 @@
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"prompt": {
"type": "string",
"oneOf": [
{
"type": "string"
},
{
"type": "array",
"items": {
"type": "string"
}
}
],
"description": "The prompt to generate a completion for"
},
"best_of": {


@@ -6477,7 +6477,11 @@ components:
The identifier of the model to use. The model must be registered with
Llama Stack and available via the /models endpoint.
prompt:
type: string
oneOf:
- type: string
- type: array
items:
type: string
description: The prompt to generate a completion for
best_of:
type: integer
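
A quick way to sanity-check the updated prompt schema fragment, using the python-jsonschema package (a sketch, not part of this change):

import jsonschema

prompt_schema = {
    "oneOf": [
        {"type": "string"},
        {"type": "array", "items": {"type": "string"}},
    ]
}

# both forms validate against the new schema
jsonschema.validate("Say hello", prompt_schema)
jsonschema.validate(["Say hello", "Say goodbye"], prompt_schema)

# anything else is rejected
try:
    jsonschema.validate(42, prompt_schema)
except jsonschema.ValidationError:
    print("rejected as expected")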


@@ -780,7 +780,7 @@ class Inference(Protocol):
async def openai_completion(
self,
model: str,
prompt: str,
prompt: Union[str, List[str]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,


@@ -423,7 +423,7 @@ class InferenceRouter(Inference):
async def openai_completion(
self,
model: str,
prompt: str,
prompt: Union[str, List[str]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,


@@ -331,7 +331,7 @@ class OllamaInferenceAdapter(
async def openai_completion(
self,
model: str,
prompt: str,
prompt: Union[str, List[str]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,


@@ -206,7 +206,7 @@ class PassthroughInferenceAdapter(Inference):
async def openai_completion(
self,
model: str,
prompt: str,
prompt: Union[str, List[str]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,


@@ -260,7 +260,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
async def openai_completion(
self,
model: str,
prompt: str,
prompt: Union[str, List[str]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,


@@ -424,7 +424,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
async def openai_completion(
self,
model: str,
prompt: str,
prompt: Union[str, List[str]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,


@@ -251,7 +251,7 @@ class LiteLLMOpenAIMixin(
async def openai_completion(
self,
model: str,
prompt: str,
prompt: Union[str, List[str]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,
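
For adapters that wrap an OpenAI-compatible backend, passing the prompt through means forwarding it untouched, since the OpenAI SDK itself accepts either form. A minimal illustrative sketch (the adapter class and its constructor are assumptions, not code from this diff):

from typing import List, Union

from openai import AsyncOpenAI


class ExampleOpenAIBackedAdapter:
    def __init__(self, base_url: str, api_key: str) -> None:
        self._client = AsyncOpenAI(base_url=base_url, api_key=api_key)

    async def openai_completion(self, model: str, prompt: Union[str, List[str]], **params):
        # no conversion needed: the backend accepts a string or a list of strings
        return await self._client.completions.create(model=model, prompt=prompt, **params)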


@@ -5,6 +5,8 @@
# the root directory of this source tree.
import json
import logging
import time
import uuid
import warnings
from typing import Any, AsyncGenerator, Dict, Iterable, List, Optional, Union
@@ -83,7 +85,7 @@ from llama_stack.apis.inference import (
TopPSamplingStrategy,
UserMessage,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAICompletionChoice
from llama_stack.models.llama.datatypes import (
BuiltinTool,
StopReason,
@@ -844,6 +846,31 @@ def _convert_openai_logprobs(
]
def _convert_openai_sampling_params(
max_tokens: Optional[int] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
) -> SamplingParams:
sampling_params = SamplingParams()
if max_tokens:
sampling_params.max_tokens = max_tokens
# Map an explicit temperature of 0 to greedy sampling
if temperature == 0:
strategy = GreedySamplingStrategy()
else:
# OpenAI defaults to 1.0 for temperature and top_p if unset
if temperature is None:
temperature = 1.0
if top_p is None:
top_p = 1.0
strategy = TopPSamplingStrategy(temperature=temperature, top_p=top_p)
sampling_params.strategy = strategy
return sampling_params
def convert_openai_chat_completion_choice(
choice: OpenAIChoice,
) -> ChatCompletionResponse:
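
A rough illustration of what the `_convert_openai_sampling_params` helper added above produces, assuming the `SamplingParams` and strategy types imported in this module:

params = _convert_openai_sampling_params(max_tokens=128, temperature=0, top_p=0.9)
assert params.max_tokens == 128
assert isinstance(params.strategy, GreedySamplingStrategy)  # temperature == 0 -> greedy

params = _convert_openai_sampling_params()
assert isinstance(params.strategy, TopPSamplingStrategy)
assert params.strategy.temperature == 1.0  # OpenAI defaults applied when unset
assert params.strategy.top_p == 1.0
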
@@ -1061,7 +1088,7 @@ class OpenAICompletionUnsupportedMixin:
async def openai_completion(
self,
model: str,
prompt: str,
prompt: Union[str, List[str]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,
@@ -1078,7 +1105,49 @@ class OpenAICompletionUnsupportedMixin:
top_p: Optional[float] = None,
user: Optional[str] = None,
) -> OpenAICompletion:
raise ValueError(f"{self.__class__.__name__} doesn't support openai completion")
if stream:
raise ValueError(f"{self.__class__.__name__} doesn't support streaming openai completions")
# This is a pretty hacky way to emulate completions -
# basically just de-batches them...
prompts = [prompt] if not isinstance(prompt, list) else prompt
sampling_params = _convert_openai_sampling_params(
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
)
choices = []
# "n" is the number of completions to generate per prompt
for _i in range(0, n):
# and we may have multiple prompts, if batching was used
for prompt in prompts:
result = self.completion(
model_id=model,
content=prompt,
sampling_params=sampling_params,
)
index = len(choices)
text = result.content
finish_reason = _convert_openai_finish_reason(result.stop_reason)
choice = OpenAICompletionChoice(
index=index,
text=text,
finish_reason=finish_reason,
)
choices.append(choice)
return OpenAICompletion(
id=f"cmpl-{uuid.uuid4()}",
choices=choices,
created=int(time.time()),
model=model,
object="text_completion",
)
class OpenAIChatCompletionUnsupportedMixin:
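
With this fallback, a non-streaming request is de-batched into one Llama Stack completion call per prompt per `n`, so the response carries `n * len(prompts)` choices. A hedged usage sketch (the adapter instance and model name are placeholders):

completion = await adapter.openai_completion(
    model="example-registered-model",
    prompt=["First prompt", "Second prompt"],
    n=2,
    max_tokens=64,
    stream=False,
)
assert completion.object == "text_completion"
assert len(completion.choices) == 4  # 2 prompts x n=2, indexed in generation order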