Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 04:04:14 +00:00)
chore: remove /v1/inference/completion and implementations
parent 606f4cf281
commit 4b641d7127

78 changed files with 16143 additions and 17755 deletions
@@ -103,8 +103,6 @@ from llama_stack.apis.inference import (
     JsonSchemaResponseFormat,
     Message,
     OpenAIChatCompletion,
-    OpenAICompletion,
-    OpenAICompletionChoice,
     OpenAIEmbeddingData,
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
@@ -1281,76 +1279,6 @@ async def prepare_openai_completion_params(**params):
     return completion_params


-class OpenAICompletionToLlamaStackMixin:
-    async def openai_completion(
-        self,
-        model: str,
-        prompt: str | list[str] | list[int] | list[list[int]],
-        best_of: int | None = None,
-        echo: bool | None = None,
-        frequency_penalty: float | None = None,
-        logit_bias: dict[str, float] | None = None,
-        logprobs: bool | None = None,
-        max_tokens: int | None = None,
-        n: int | None = None,
-        presence_penalty: float | None = None,
-        seed: int | None = None,
-        stop: str | list[str] | None = None,
-        stream: bool | None = None,
-        stream_options: dict[str, Any] | None = None,
-        temperature: float | None = None,
-        top_p: float | None = None,
-        user: str | None = None,
-        guided_choice: list[str] | None = None,
-        prompt_logprobs: int | None = None,
-        suffix: str | None = None,
-    ) -> OpenAICompletion:
-        if stream:
-            raise ValueError(f"{self.__class__.__name__} doesn't support streaming openai completions")
-
-        # This is a pretty hacky way to do emulate completions -
-        # basically just de-batches them...
-        prompts = [prompt] if not isinstance(prompt, list) else prompt
-
-        sampling_params = _convert_openai_sampling_params(
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-        )
-
-        choices = []
-        # "n" is the number of completions to generate per prompt
-        n = n or 1
-        for _i in range(0, n):
-            # and we may have multiple prompts, if batching was used
-
-            for prompt in prompts:
-                result = self.completion(
-                    model_id=model,
-                    content=prompt,
-                    sampling_params=sampling_params,
-                )
-
-                index = len(choices)
-                text = result.content
-                finish_reason = _convert_stop_reason_to_openai_finish_reason(result.stop_reason)
-
-                choice = OpenAICompletionChoice(
-                    index=index,
-                    text=text,
-                    finish_reason=finish_reason,
-                )
-                choices.append(choice)
-
-        return OpenAICompletion(
-            id=f"cmpl-{uuid.uuid4()}",
-            choices=choices,
-            created=int(time.time()),
-            model=model,
-            object="text_completion",
-        )
-
-
 class OpenAIChatCompletionToLlamaStackMixin:
     async def openai_chat_completion(
         self,
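For reference, the emulation the removed mixin performed is easy to show in isolation: normalize the prompt into a list, call the underlying single-prompt completion once per prompt for each requested choice, and flatten the results into OpenAI-style choices. Below is a minimal standalone sketch of that de-batching pattern, using plain Python with hypothetical helper and type names rather than the Llama Stack implementation:

import time
import uuid
from collections.abc import Callable
from dataclasses import dataclass, asdict


@dataclass
class Choice:
    # Minimal stand-in for OpenAICompletionChoice (hypothetical, not the Llama Stack type).
    index: int
    text: str
    finish_reason: str


def emulate_openai_completion(
    complete_one: Callable[[str], tuple[str, str]],  # prompt -> (text, finish_reason)
    prompt: str | list[str],
    n: int = 1,
) -> dict:
    # Normalize the prompt into a list of prompts (the "de-batching" step).
    prompts = [prompt] if isinstance(prompt, str) else prompt

    # One underlying single-prompt completion per prompt, repeated n times,
    # flattened into OpenAI-style choices.
    choices: list[Choice] = []
    for _ in range(n):
        for p in prompts:
            text, finish_reason = complete_one(p)
            choices.append(Choice(index=len(choices), text=text, finish_reason=finish_reason))

    # Assemble an OpenAI-shaped text_completion response.
    return {
        "id": f"cmpl-{uuid.uuid4()}",
        "object": "text_completion",
        "created": int(time.time()),
        "choices": [asdict(c) for c in choices],
    }


if __name__ == "__main__":
    # Toy backend that echoes the prompt; a real provider would call a model here.
    def fake_backend(p: str) -> tuple[str, str]:
        return f"echo: {p}", "stop"

    print(emulate_openai_completion(fake_backend, ["hello", "world"], n=2))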