From a193c9fc3f30c99c0897a49f38a2ba9c2e8f2f2c Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Mon, 7 Apr 2025 21:27:06 -0400
Subject: [PATCH] Add OpenAI-Compatible models, completions, chat/completions endpoints

This stubs in some OpenAI server-side compatibility with three new
endpoints:

/v1/openai/v1/models
/v1/openai/v1/completions
/v1/openai/v1/chat/completions

This gives common inference apps that use OpenAI clients the ability to
talk to Llama Stack via an endpoint like
http://localhost:8321/v1/openai/v1 .

The two "v1" instances in there aren't awesome, but the thinking is that
Llama Stack's API is v1 and our OpenAI compatibility layer is compatible
with OpenAI V1. And since some OpenAI clients implicitly assume the URL
ends with "v1", this gives maximum compatibility.

The OpenAI models endpoint is implemented in the routing layer and just
returns all the models Llama Stack knows about. The completions and
chat/completions endpoints are only actually implemented for the
remote-vllm provider right now, which simply proxies the requests to the
backend vLLM server.

The goal is to support this for every inference provider - proxying
directly to the provider's OpenAI endpoint for OpenAI-compatible
providers. For providers that don't have an OpenAI-compatible API, we'll
add a mixin to translate incoming OpenAI requests to Llama Stack
inference requests and to translate the Llama Stack inference responses
back to OpenAI responses.
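As a quick smoke test, a stock OpenAI Python client pointed at that base
URL should be able to list models and request a chat completion. This is
just a sketch - the model id below is a placeholder for whatever model
your distribution actually has registered, and the dummy API key assumes
the server isn't enforcing auth:

    from openai import OpenAI

    # Point the standard OpenAI client at Llama Stack's compatibility prefix.
    client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

    # GET /v1/openai/v1/models
    print([m.id for m in client.models.list()])

    # POST /v1/openai/v1/chat/completions (remote-vllm only, for now)
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
        messages=[{"role": "user", "content": "Hello!"}],
    )
    print(response.choices[0].message.content)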
---
 llama_stack/apis/inference/inference.py       |  57 +++++++++
 llama_stack/apis/models/models.py             |   8 ++
 llama_stack/distribution/routers/routers.py   | 120 ++++++++++++++++++
 .../distribution/routers/routing_tables.py    |  17 ++-
 .../inference/meta_reference/inference.py     |   6 +
 .../sentence_transformers.py                  |  56 +++++++-
 .../remote/inference/ollama/ollama.py         |   9 +-
 .../providers/remote/inference/vllm/vllm.py   | 109 +++++++++++++++-
 .../utils/inference/openai_compat.py          |  58 ++++++++-
 pyproject.toml                                |   1 +
 requirements.txt                              |   2 +
 uv.lock                                       |   8 +-
 12 files changed, 443 insertions(+), 8 deletions(-)

diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index e59132e33..864bef2d5 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -17,6 +17,9 @@ from typing import (
     runtime_checkable,
 )
 
+from openai.types.chat import ChatCompletion as OpenAIChatCompletion
+from openai.types.chat import ChatCompletionMessageParam as OpenAIChatCompletionMessageParam
+from openai.types.completion import Completion as OpenAICompletion
 from pydantic import BaseModel, Field, field_validator
 from typing_extensions import Annotated
 
@@ -564,3 +567,57 @@ class Inference(Protocol):
         :returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
         """
         ...
+
+    @webmethod(route="/openai/v1/completions", method="POST")
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: str,
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAICompletion:
+        """Generate an OpenAI-compatible completion for the given prompt using the specified model."""
+        ...
+
+    @webmethod(route="/openai/v1/chat/completions", method="POST")
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIChatCompletionMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[Dict[str, str]] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAIChatCompletion:
+        """Generate an OpenAI-compatible chat completion for the given messages using the specified model."""
+        ...
diff --git a/llama_stack/apis/models/models.py b/llama_stack/apis/models/models.py
index 893ebc179..e48add882 100644
--- a/llama_stack/apis/models/models.py
+++ b/llama_stack/apis/models/models.py
@@ -7,6 +7,7 @@
 from enum import Enum
 from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
 
+from openai.types.model import Model as OpenAIModel
 from pydantic import BaseModel, ConfigDict, Field
 
 from llama_stack.apis.resource import Resource, ResourceType
@@ -56,12 +57,19 @@ class ListModelsResponse(BaseModel):
     data: List[Model]
 
 
+class OpenAIListModelsResponse(BaseModel):
+    data: List[OpenAIModel]
+
+
 @runtime_checkable
 @trace_protocol
 class Models(Protocol):
     @webmethod(route="/models", method="GET")
     async def list_models(self) -> ListModelsResponse: ...
 
+    @webmethod(route="/openai/v1/models", method="GET")
+    async def openai_list_models(self) -> OpenAIListModelsResponse: ...
+
     @webmethod(route="/models/{model_id:path}", method="GET")
     async def get_model(
         self,
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py
index eed96a40a..146ac5021 100644
--- a/llama_stack/distribution/routers/routers.py
+++ b/llama_stack/distribution/routers/routers.py
@@ -7,6 +7,10 @@
 import time
 from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
 
+from openai.types.chat import ChatCompletion as OpenAIChatCompletion
+from openai.types.chat import ChatCompletionMessageParam as OpenAIChatCompletionMessageParam
+from openai.types.completion import Completion as OpenAICompletion
+
 from llama_stack.apis.common.content_types import (
     URL,
     InterleavedContent,
@@ -419,6 +423,122 @@ class InferenceRouter(Inference):
             task_type=task_type,
         )
 
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: str,
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAICompletion:
+        logger.debug(
+            f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}",
+        )
+        model_obj = await self.routing_table.get_model(model)
+        if model_obj is None:
+            raise ValueError(f"Model '{model}' not found")
+        if model_obj.model_type == ModelType.embedding:
+            raise ValueError(f"Model '{model}' is an embedding model and does not support completions")
+
+        params = dict(
+            model=model_obj.identifier,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+        )
+
+        provider = self.routing_table.get_provider_impl(model_obj.identifier)
+        return await provider.openai_completion(**params)
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIChatCompletionMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[Dict[str, str]] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAIChatCompletion:
+        logger.debug(
+            f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}",
+        )
+        model_obj = await self.routing_table.get_model(model)
+        if model_obj is None:
+            raise ValueError(f"Model '{model}' not found")
+        if model_obj.model_type == ModelType.embedding:
+            raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions")
+
+        params = dict(
+            model=model_obj.identifier,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+
+        provider = self.routing_table.get_provider_impl(model_obj.identifier)
+        return await provider.openai_chat_completion(**params)
+
 
 class SafetyRouter(Safety):
     def __init__(
diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py
index f6adae49d..5ec90864e 100644
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@@ -5,9 +5,11 @@
 # the root directory of this source tree.
 
 import logging
+import time
 import uuid
 from typing import Any, Dict, List, Optional
 
+from openai.types.model import Model as OpenAIModel
 from pydantic import TypeAdapter
 
 from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse
@@ -23,7 +25,7 @@ from llama_stack.apis.datasets import (
     RowsDataSource,
     URIDataSource,
 )
-from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType
+from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType, OpenAIListModelsResponse
 from llama_stack.apis.resource import ResourceType
 from llama_stack.apis.scoring_functions import (
     ListScoringFunctionsResponse,
@@ -254,6 +256,19 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
     async def list_models(self) -> ListModelsResponse:
         return ListModelsResponse(data=await self.get_all_with_type("model"))
 
+    async def openai_list_models(self) -> OpenAIListModelsResponse:
+        models = await self.get_all_with_type("model")
+        openai_models = [
+            OpenAIModel(
+                id=model.identifier,
+                object="model",
+                created=int(time.time()),
+                owned_by="llama_stack",
+            )
+            for model in models
+        ]
+        return OpenAIListModelsResponse(data=openai_models)
+
     async def get_model(self, model_id: str) -> Model:
         model = await self.get_object_by_identifier("model", model_id)
         if model is None:
diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py
index 5f81d6421..3a7632065 100644
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@@ -54,6 +54,10 @@ from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,
     build_hf_repo_model_entry,
 )
+from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionUnsupportedMixin,
+    OpenAICompletionUnsupportedMixin,
+)
 from llama_stack.providers.utils.inference.prompt_adapter import (
     augment_content_with_response_format_prompt,
     chat_completion_request_to_messages,
@@ -79,6 +83,8 @@ def llama4_builder_fn(config: MetaReferenceInferenceConfig, model_id: str, llama
 
 
 class MetaReferenceInferenceImpl(
+    OpenAICompletionUnsupportedMixin,
+    OpenAIChatCompletionUnsupportedMixin,
     SentenceTransformerEmbeddingMixin,
     Inference,
     ModelsProtocolPrivate,
diff --git a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
index 39847e085..26a34064d 100644
--- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
@@ -5,7 +5,11 @@
 # the root directory of this source tree.
 
 import logging
-from typing import AsyncGenerator, List, Optional, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+
+from openai.types.chat import ChatCompletion as OpenAIChatCompletion
+from openai.types.chat import ChatCompletionMessageParam as OpenAIChatCompletionMessageParam
+from openai.types.completion import Completion as OpenAICompletion
 
 from llama_stack.apis.inference import (
     CompletionResponse,
@@ -74,3 +78,53 @@ class SentenceTransformersInferenceImpl(
         tool_config: Optional[ToolConfig] = None,
     ) -> AsyncGenerator:
         raise ValueError("Sentence transformers don't support chat completion")
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: str,
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAICompletion:
+        raise ValueError("Sentence transformers don't support openai completion")
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIChatCompletionMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[Dict[str, str]] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAIChatCompletion:
+        raise ValueError("Sentence transformers don't support openai chat completion")
diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py
index 12902996b..944493b6d 100644
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@@ -45,8 +45,10 @@ from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,
 )
 from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionUnsupportedMixin,
     OpenAICompatCompletionChoice,
     OpenAICompatCompletionResponse,
+    OpenAICompletionUnsupportedMixin,
     get_sampling_options,
     process_chat_completion_response,
     process_chat_completion_stream_response,
@@ -67,7 +69,12 @@ from .models import model_entries
 logger = get_logger(name=__name__, category="inference")
 
 
-class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
+class OllamaInferenceAdapter(
+    OpenAICompletionUnsupportedMixin,
+    OpenAIChatCompletionUnsupportedMixin,
+    Inference,
+    ModelsProtocolPrivate,
+):
     def __init__(self, url: str) -> None:
         self.register_helper = ModelRegistryHelper(model_entries)
         self.url = url
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 6a828322f..18e6a1972 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -5,13 +5,16 @@
 # the root directory of this source tree.
 import json
 import logging
-from typing import Any, AsyncGenerator, List, Optional, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Union
 
 import httpx
 from openai import AsyncOpenAI
+from openai.types.chat import ChatCompletion as OpenAIChatCompletion
+from openai.types.chat import ChatCompletionMessageParam as OpenAIChatCompletionMessageParam
 from openai.types.chat.chat_completion_chunk import (
     ChatCompletionChunk as OpenAIChatCompletionChunk,
 )
+from openai.types.completion import Completion as OpenAICompletion
 
 from llama_stack.apis.common.content_types import (
     InterleavedContent,
@@ -418,3 +421,107 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
 
         embeddings = [data.embedding for data in response.data]
         return EmbeddingsResponse(embeddings=embeddings)
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: str,
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAICompletion:
+        model_obj = await self._get_model(model)
+        params = {
+            k: v
+            for k, v in {
+                "model": model_obj.provider_resource_id,
+                "prompt": prompt,
+                "best_of": best_of,
+                "echo": echo,
+                "frequency_penalty": frequency_penalty,
+                "logit_bias": logit_bias,
+                "logprobs": logprobs,
+                "max_tokens": max_tokens,
+                "n": n,
+                "presence_penalty": presence_penalty,
+                "seed": seed,
+                "stop": stop,
+                "stream": stream,
+                "stream_options": stream_options,
+                "temperature": temperature,
+                "top_p": top_p,
+                "user": user,
+            }.items()
+            if v is not None
+        }
+        return await self.client.completions.create(**params)  # type: ignore
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIChatCompletionMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[Dict[str, str]] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAIChatCompletion:
+        model_obj = await self._get_model(model)
+        params = {
+            k: v
+            for k, v in {
+                "model": model_obj.provider_resource_id,
+                "messages": messages,
+                "frequency_penalty": frequency_penalty,
+                "function_call": function_call,
+                "functions": functions,
+                "logit_bias": logit_bias,
+                "logprobs": logprobs,
+                "max_completion_tokens": max_completion_tokens,
+                "max_tokens": max_tokens,
+                "n": n,
+                "parallel_tool_calls": parallel_tool_calls,
+                "presence_penalty": presence_penalty,
+                "response_format": response_format,
+                "seed": seed,
+                "stop": stop,
+                "stream": stream,
+                "stream_options": stream_options,
+                "temperature": temperature,
+                "tool_choice": tool_choice,
+                "tools": tools,
+                "top_logprobs": top_logprobs,
+                "top_p": top_p,
+                "user": user,
+            }.items()
+            if v is not None
+        }
+        return await self.client.chat.completions.create(**params)  # type: ignore
diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py
index 0f3945b34..3f1846b76 100644
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@@ -6,9 +6,10 @@
 import json
 import logging
 import warnings
-from typing import AsyncGenerator, Dict, Iterable, List, Optional, Union
+from typing import Any, AsyncGenerator, Dict, Iterable, List, Optional, Union
 
 from openai import AsyncStream
+from openai.types.chat import ChatCompletion as OpenAIChatCompletion
 from openai.types.chat import (
     ChatCompletionAssistantMessageParam as OpenAIChatCompletionAssistantMessage,
 )
@@ -54,6 +55,7 @@ from openai.types.chat.chat_completion_content_part_image_param import (
 from openai.types.chat.chat_completion_message_tool_call_param import (
     Function as OpenAIFunction,
 )
+from openai.types.completion import Completion as OpenAICompletion
 from pydantic import BaseModel
 
 from llama_stack.apis.common.content_types import (
@@ -1049,3 +1051,57 @@ async def convert_openai_chat_completion_stream(
             stop_reason=stop_reason,
         )
     )
+
+
+class OpenAICompletionUnsupportedMixin:
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: str,
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAICompletion:
+        raise ValueError(f"{self.__class__.__name__} doesn't support openai completion")
+
+
+class OpenAIChatCompletionUnsupportedMixin:
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIChatCompletionMessage],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[Dict[str, str]] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAIChatCompletion:
+        raise ValueError(f"{self.__class__.__name__} doesn't support openai chat completion")
diff --git a/pyproject.toml b/pyproject.toml
index 83260b681..9ef3abe68 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
     "jinja2>=3.1.6",
     "jsonschema",
     "llama-stack-client>=0.2.1",
+    "openai>=1.66",
     "prompt-toolkit",
     "python-dotenv",
     "pydantic>=2",
diff --git a/requirements.txt b/requirements.txt
index 6645e4e36..ef5782905 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,6 +19,7 @@ httpx==0.28.1
 huggingface-hub==0.29.0
 idna==3.10
 jinja2==3.1.6
+jiter==0.8.2
 jsonschema==4.23.0
 jsonschema-specifications==2024.10.1
 llama-stack-client==0.2.1
@@ -27,6 +28,7 @@ markdown-it-py==3.0.0
 markupsafe==3.0.2
 mdurl==0.1.2
 numpy==2.2.3
+openai==1.71.0
 packaging==24.2
 pandas==2.2.3
 pillow==11.1.0
diff --git a/uv.lock b/uv.lock
index 1f7adea82..c6c9b1004 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1384,6 +1384,7 @@ dependencies = [
     { name = "jinja2" },
     { name = "jsonschema" },
     { name = "llama-stack-client" },
+    { name = "openai" },
     { name = "pillow" },
     { name = "prompt-toolkit" },
     { name = "pydantic" },
@@ -1485,6 +1486,7 @@ requires-dist = [
     { name = "mcp", marker = "extra == 'test'" },
     { name = "myst-parser", marker = "extra == 'docs'" },
     { name = "nbval", marker = "extra == 'dev'" },
+    { name = "openai", specifier = ">=1.66" },
     { name = "openai", marker = "extra == 'test'" },
     { name = "openai", marker = "extra == 'unit'" },
     { name = "opentelemetry-exporter-otlp-proto-http", marker = "extra == 'test'" },
@@ -2016,7 +2018,7 @@ wheels = [
 
 [[package]]
 name = "openai"
-version = "1.63.2"
+version = "1.71.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
@@ -2028,9 +2030,9 @@ dependencies = [
     { name = "tqdm" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/e6/1c/11b520deb71f9ea54ced3c52cd6a5f7131215deba63ad07f23982e328141/openai-1.63.2.tar.gz", hash = "sha256:aeabeec984a7d2957b4928ceaa339e2ead19c61cfcf35ae62b7c363368d26360", size = 356902 }
+sdist = { url = "https://files.pythonhosted.org/packages/d9/19/b8f0347090a649dce55a008ec54ac6abb50553a06508cdb5e7abb2813e99/openai-1.71.0.tar.gz", hash = "sha256:52b20bb990a1780f9b0b8ccebac93416343ebd3e4e714e3eff730336833ca207", size = 409926 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/15/64/db3462b358072387b8e93e6e6a38d3c741a17b4a84171ef01d6c85c63f25/openai-1.63.2-py3-none-any.whl", hash = "sha256:1f38b27b5a40814c2b7d8759ec78110df58c4a614c25f182809ca52b080ff4d4", size = 472282 },
+    { url = "https://files.pythonhosted.org/packages/c4/f7/049e85faf6a000890e5ca0edca8e9183f8a43c9e7bba869cad871da0caba/openai-1.71.0-py3-none-any.whl", hash = "sha256:e1c643738f1fff1af52bce6ef06a7716c95d089281e7011777179614f32937aa", size = 598975 },
"https://files.pythonhosted.org/packages/c4/f7/049e85faf6a000890e5ca0edca8e9183f8a43c9e7bba869cad871da0caba/openai-1.71.0-py3-none-any.whl", hash = "sha256:e1c643738f1fff1af52bce6ef06a7716c95d089281e7011777179614f32937aa", size = 598975 }, ] [[package]]