From 70c088af3abc5609f11cb2bcc8eb1a944e51b6e7 Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Thu, 17 Apr 2025 14:47:24 -0400
Subject: [PATCH] Stub in an initial OpenAI Responses API

Signed-off-by: Ben Browning
---
 .../self_hosted_distro/remote-vllm.md         |   1 +
 llama_stack/apis/datatypes.py                 |   1 +
 llama_stack/apis/openai_responses/__init__.py |   7 +
 .../apis/openai_responses/openai_responses.py |  91 +++++++++++++
 llama_stack/distribution/resolver.py          |   2 +
 .../distribution/routers/routing_tables.py    |   2 +
 .../inline/openai_responses/__init__.py       |  19 +++
 .../inline/openai_responses/config.py         |  24 ++++
 .../openai_responses/openai_responses.py      | 126 ++++++++++++++++++
 .../providers/registry/openai_responses.py    |  25 ++++
 llama_stack/templates/remote-vllm/build.yaml  |   2 +
 .../remote-vllm/run-with-safety.yaml          |   9 ++
 llama_stack/templates/remote-vllm/run.yaml    |   9 ++
 llama_stack/templates/remote-vllm/vllm.py     |   1 +
 .../integration/openai_responses/__init__.py  |   5 +
 .../openai_responses/test_openai_responses.py |  90 +++++++++++++
 .../test_cases/openai/responses.json          |  26 ++++
 tests/integration/test_cases/test_case.py     |   1 +
 18 files changed, 441 insertions(+)
 create mode 100644 llama_stack/apis/openai_responses/__init__.py
 create mode 100644 llama_stack/apis/openai_responses/openai_responses.py
 create mode 100644 llama_stack/providers/inline/openai_responses/__init__.py
 create mode 100644 llama_stack/providers/inline/openai_responses/config.py
 create mode 100644 llama_stack/providers/inline/openai_responses/openai_responses.py
 create mode 100644 llama_stack/providers/registry/openai_responses.py
 create mode 100644 tests/integration/openai_responses/__init__.py
 create mode 100644 tests/integration/openai_responses/test_openai_responses.py
 create mode 100644 tests/integration/test_cases/openai/responses.json

diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md
index 46df56008..74365722d 100644
--- a/docs/source/distributions/self_hosted_distro/remote-vllm.md
+++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md
@@ -18,6 +18,7 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::vllm`, `inline::sentence-transformers` |
+| openai_responses | `inline::openai-responses` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py
index 25f3ab1ab..85c0ecc6b 100644
--- a/llama_stack/apis/datatypes.py
+++ b/llama_stack/apis/datatypes.py
@@ -24,6 +24,7 @@ class Api(Enum):
     eval = "eval"
     post_training = "post_training"
     tool_runtime = "tool_runtime"
+    openai_responses = "openai_responses"
 
     telemetry = "telemetry"
diff --git a/llama_stack/apis/openai_responses/__init__.py b/llama_stack/apis/openai_responses/__init__.py
new file mode 100644
index 000000000..a3b32ff71
--- /dev/null
+++ b/llama_stack/apis/openai_responses/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .openai_responses import *  # noqa: F401 F403
diff --git a/llama_stack/apis/openai_responses/openai_responses.py b/llama_stack/apis/openai_responses/openai_responses.py
new file mode 100644
index 000000000..c8324a13a
--- /dev/null
+++ b/llama_stack/apis/openai_responses/openai_responses.py
@@ -0,0 +1,91 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import AsyncIterator, List, Literal, Optional, Protocol, Union, runtime_checkable
+
+from pydantic import BaseModel, Field
+from typing_extensions import Annotated
+
+from llama_stack.schema_utils import json_schema_type, webmethod
+
+
+@json_schema_type
+class OpenAIResponseError(BaseModel):
+    code: str
+    message: str
+
+
+@json_schema_type
+class OpenAIResponseOutputMessageContentOutputText(BaseModel):
+    text: str
+    type: Literal["output_text"] = "output_text"
+
+
+OpenAIResponseOutputMessageContent = Annotated[
+    Union[OpenAIResponseOutputMessageContentOutputText,],
+    Field(discriminator="type"),
+]
+
+
+@json_schema_type
+class OpenAIResponseOutputMessage(BaseModel):
+    id: str
+    content: List[OpenAIResponseOutputMessageContent]
+    role: Literal["assistant"] = "assistant"
+    status: str
+    type: Literal["message"] = "message"
+
+
+OpenAIResponseOutput = Annotated[
+    Union[OpenAIResponseOutputMessage,],
+    Field(discriminator="type"),
+]
+
+
+@json_schema_type
+class OpenAIResponseObject(BaseModel):
+    created_at: int
+    error: Optional[OpenAIResponseError] = None
+    id: str
+    model: str
+    object: Literal["response"] = "response"
+    output: List[OpenAIResponseOutput]
+    parallel_tool_calls: bool = False
+    previous_response_id: Optional[str] = None
+    status: str
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    truncation: Optional[str] = None
+    user: Optional[str] = None
+
+
+@json_schema_type
+class OpenAIResponseObjectStream(BaseModel):
+    response: OpenAIResponseObject
+    type: Literal["response.created"] = "response.created"
+
+
+@runtime_checkable
+class OpenAIResponses(Protocol):
+    """
+    OpenAI Responses API implementation.
+    """
+
+    @webmethod(route="/openai/v1/responses/{id}", method="GET")
+    async def get_openai_response(
+        self,
+        id: str,
+    ) -> OpenAIResponseObject: ...
+
+    @webmethod(route="/openai/v1/responses", method="POST")
+    async def create_openai_response(
+        self,
+        input: str,
+        model: str,
+        previous_response_id: Optional[str] = None,
+        store: Optional[bool] = True,
+        stream: Optional[bool] = False,
+    ) -> Union[OpenAIResponseObject, AsyncIterator[OpenAIResponseObjectStream]]: ...
diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py
index e9a594eba..25c91fca1 100644
--- a/llama_stack/distribution/resolver.py
+++ b/llama_stack/distribution/resolver.py
@@ -16,6 +16,7 @@ from llama_stack.apis.files import Files
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.inspect import Inspect
 from llama_stack.apis.models import Models
+from llama_stack.apis.openai_responses.openai_responses import OpenAIResponses
 from llama_stack.apis.post_training import PostTraining
 from llama_stack.apis.providers import Providers as ProvidersAPI
 from llama_stack.apis.safety import Safety
@@ -80,6 +81,7 @@ def api_protocol_map() -> Dict[Api, Any]:
         Api.tool_groups: ToolGroups,
         Api.tool_runtime: ToolRuntime,
         Api.files: Files,
+        Api.openai_responses: OpenAIResponses,
     }
diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py
index 18b0c891f..50416f338 100644
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@@ -149,6 +149,8 @@ class CommonRoutingTableImpl(RoutingTable):
                 p.benchmark_store = self
             elif api == Api.tool_runtime:
                 p.tool_store = self
+            elif api == Api.openai_responses:
+                p.model_store = self
 
     async def shutdown(self) -> None:
         for p in self.impls_by_provider_id.values():
diff --git a/llama_stack/providers/inline/openai_responses/__init__.py b/llama_stack/providers/inline/openai_responses/__init__.py
new file mode 100644
index 000000000..6d461e81a
--- /dev/null
+++ b/llama_stack/providers/inline/openai_responses/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict
+
+from llama_stack.apis.datatypes import Api
+
+from .config import OpenAIResponsesImplConfig
+
+
+async def get_provider_impl(config: OpenAIResponsesImplConfig, deps: Dict[Api, Any]):
+    from .openai_responses import OpenAIResponsesImpl
+
+    impl = OpenAIResponsesImpl(config, deps[Api.models], deps[Api.inference])
+    await impl.initialize()
+    return impl
diff --git a/llama_stack/providers/inline/openai_responses/config.py b/llama_stack/providers/inline/openai_responses/config.py
new file mode 100644
index 000000000..f97b2fe68
--- /dev/null
+++ b/llama_stack/providers/inline/openai_responses/config.py
@@ -0,0 +1,24 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict
+
+from pydantic import BaseModel
+
+from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
+
+
+class OpenAIResponsesImplConfig(BaseModel):
+    kvstore: KVStoreConfig
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {
+            "kvstore": SqliteKVStoreConfig.sample_run_config(
+                __distro_dir__=__distro_dir__,
+                db_name="openai_responses.db",
+            )
+        }
diff --git a/llama_stack/providers/inline/openai_responses/openai_responses.py b/llama_stack/providers/inline/openai_responses/openai_responses.py
new file mode 100644
index 000000000..c23fe9b75
--- /dev/null
+++ b/llama_stack/providers/inline/openai_responses/openai_responses.py
@@ -0,0 +1,126 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import uuid
+from typing import AsyncIterator, List, Optional, cast
+
+from llama_stack.apis.inference.inference import (
+    Inference,
+    OpenAIAssistantMessageParam,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionContentPartTextParam,
+    OpenAIMessageParam,
+    OpenAIUserMessageParam,
+)
+from llama_stack.apis.models.models import Models, ModelType
+from llama_stack.apis.openai_responses import OpenAIResponses
+from llama_stack.apis.openai_responses.openai_responses import (
+    OpenAIResponseObject,
+    OpenAIResponseObjectStream,
+    OpenAIResponseOutputMessage,
+    OpenAIResponseOutputMessageContentOutputText,
+)
+from llama_stack.log import get_logger
+from llama_stack.providers.utils.kvstore import kvstore_impl
+
+from .config import OpenAIResponsesImplConfig
+
+logger = get_logger(name=__name__, category="openai_responses")
+
+OPENAI_RESPONSES_PREFIX = "openai_responses:"
+
+
+class OpenAIResponsesImpl(OpenAIResponses):
+    def __init__(self, config: OpenAIResponsesImplConfig, models_api: Models, inference_api: Inference):
+        self.config = config
+        self.models_api = models_api
+        self.inference_api = inference_api
+
+    async def initialize(self) -> None:
+        self.kvstore = await kvstore_impl(self.config.kvstore)
+
+    async def shutdown(self) -> None:
+        logger.debug("OpenAIResponsesImpl.shutdown")
+        pass
+
+    async def get_openai_response(
+        self,
+        id: str,
+    ) -> OpenAIResponseObject:
+        key = f"{OPENAI_RESPONSES_PREFIX}{id}"
+        response_json = await self.kvstore.get(key=key)
+        if response_json is None:
+            raise ValueError(f"OpenAI response with id '{id}' not found")
+        return OpenAIResponseObject.model_validate_json(response_json)
+
+    async def create_openai_response(
+        self,
+        input: str,
+        model: str,
+        previous_response_id: Optional[str] = None,
+        store: Optional[bool] = True,
+        stream: Optional[bool] = False,
+    ):
+        model_obj = await self.models_api.get_model(model)
+        if model_obj is None:
+            raise ValueError(f"Model '{model}' not found")
+        if model_obj.model_type == ModelType.embedding:
+            raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions")
+
+        messages: List[OpenAIMessageParam] = []
+        if previous_response_id:
+            previous_response = await self.get_openai_response(previous_response_id)
+            messages.append(OpenAIAssistantMessageParam(content=previous_response.output[0].content[0].text))
+
+        messages.append(OpenAIUserMessageParam(content=input))
+
+        chat_response = await self.inference_api.openai_chat_completion(
+            model=model_obj.identifier,
+            messages=messages,
+        )
+        # type cast to appease mypy
+        chat_response = cast(OpenAIChatCompletion, chat_response)
+
+        output_messages = []
+        for choice in chat_response.choices:
+            output_content = ""
+            if isinstance(choice.message.content, str):
+                output_content = choice.message.content
+            elif isinstance(choice.message.content, OpenAIChatCompletionContentPartTextParam):
+                output_content = choice.message.content.text
+            # TODO: handle image content
+            output_messages.append(
+                OpenAIResponseOutputMessage(
+                    id=f"msg_{uuid.uuid4()}",
+                    content=[OpenAIResponseOutputMessageContentOutputText(text=output_content)],
+                    status="completed",
+                )
+            )
+        response = OpenAIResponseObject(
+            created_at=chat_response.created,
+            id=f"resp-{uuid.uuid4()}",
+            model=model_obj.identifier,
+            object="response",
+            status="completed",
+            output=output_messages,
+        )
+
+        if store:
+            # Store in kvstore
+            key = f"{OPENAI_RESPONSES_PREFIX}{response.id}"
+            await self.kvstore.set(
+                key=key,
+                value=response.model_dump_json(),
+            )
+
+        if stream:
+
+            async def async_response() -> AsyncIterator[OpenAIResponseObjectStream]:
+                yield OpenAIResponseObjectStream(response=response)
+
+            return async_response()
+
+        return response
diff --git a/llama_stack/providers/registry/openai_responses.py b/llama_stack/providers/registry/openai_responses.py
new file mode 100644
index 000000000..dd60f19dc
--- /dev/null
+++ b/llama_stack/providers/registry/openai_responses.py
@@ -0,0 +1,25 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import List
+
+from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
+
+
+def available_providers() -> List[ProviderSpec]:
+    return [
+        InlineProviderSpec(
+            api=Api.openai_responses,
+            provider_type="inline::openai-responses",
+            pip_packages=[],
+            module="llama_stack.providers.inline.openai_responses",
+            config_class="llama_stack.providers.inline.openai_responses.config.OpenAIResponsesImplConfig",
+            api_dependencies=[
+                Api.models,
+                Api.inference,
+            ],
+        ),
+    ]
diff --git a/llama_stack/templates/remote-vllm/build.yaml b/llama_stack/templates/remote-vllm/build.yaml
index b2bbf853a..94326e570 100644
--- a/llama_stack/templates/remote-vllm/build.yaml
+++ b/llama_stack/templates/remote-vllm/build.yaml
@@ -31,4 +31,6 @@ distribution_spec:
     - inline::rag-runtime
     - remote::model-context-protocol
     - remote::wolfram-alpha
+    openai_responses:
+    - inline::openai-responses
 image_type: conda
diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml
index bb69496aa..c53228ed4 100644
--- a/llama_stack/templates/remote-vllm/run-with-safety.yaml
+++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml
@@ -5,6 +5,7 @@ apis:
 - datasetio
 - eval
 - inference
+- openai_responses
 - safety
 - scoring
 - telemetry
@@ -115,6 +116,14 @@ providers:
     provider_type: remote::wolfram-alpha
     config:
       api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
+  openai_responses:
+  - provider_id: openai-responses
+    provider_type: inline::openai-responses
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/openai_responses.db
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db
diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml
index 14f2da37e..282749d0d 100644
--- a/llama_stack/templates/remote-vllm/run.yaml
+++ b/llama_stack/templates/remote-vllm/run.yaml
@@ -5,6 +5,7 @@ apis:
 - datasetio
 - eval
 - inference
+- openai_responses
 - safety
 - scoring
 - telemetry
@@ -108,6 +109,14 @@ providers:
     provider_type: remote::wolfram-alpha
     config:
       api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
+  openai_responses:
+  - provider_id: openai-responses
+    provider_type: inline::openai-responses
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/openai_responses.db
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db
diff --git a/llama_stack/templates/remote-vllm/vllm.py b/llama_stack/templates/remote-vllm/vllm.py
index 0f6c7659e..5cddb1c76 100644
--- a/llama_stack/templates/remote-vllm/vllm.py
+++ b/llama_stack/templates/remote-vllm/vllm.py
@@ -39,6 +39,7 @@ def get_distribution_template() -> DistributionTemplate:
             "remote::model-context-protocol",
             "remote::wolfram-alpha",
         ],
+        "openai_responses": ["inline::openai-responses"],
     }
     name = "remote-vllm"
     inference_provider = Provider(
diff --git a/tests/integration/openai_responses/__init__.py b/tests/integration/openai_responses/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/tests/integration/openai_responses/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/tests/integration/openai_responses/test_openai_responses.py b/tests/integration/openai_responses/test_openai_responses.py
new file mode 100644
index 000000000..870c14636
--- /dev/null
+++ b/tests/integration/openai_responses/test_openai_responses.py
@@ -0,0 +1,90 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+import pytest
+from openai import OpenAI
+
+from ..test_cases.test_case import TestCase
+
+
+@pytest.fixture
+def openai_client(client_with_models):
+    base_url = f"{client_with_models.base_url}/v1/openai/v1"
+    return OpenAI(base_url=base_url, api_key="bar")
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "openai:responses:non_streaming_01",
+        "openai:responses:non_streaming_02",
+    ],
+)
+def test_openai_responses_non_streaming(openai_client, client_with_models, text_model_id, test_case):
+    tc = TestCase(test_case)
+    question = tc["question"]
+    expected = tc["expected"]
+
+    response = openai_client.responses.create(
+        model=text_model_id,
+        input=question,
+        stream=False,
+    )
+    output_text = response.output_text.lower().strip()
+    assert len(output_text) > 0
+    assert expected.lower() in output_text
+
+    retrieved_response = openai_client.responses.retrieve(response_id=response.id)
+    assert retrieved_response.output_text == response.output_text
+
+    next_response = openai_client.responses.create(
+        model=text_model_id, input="Repeat your previous response in all caps.", previous_response_id=response.id
+    )
+    next_output_text = next_response.output_text.strip()
+    assert expected.upper() in next_output_text
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "openai:responses:streaming_01",
+        "openai:responses:streaming_02",
+    ],
+)
+def test_openai_responses_streaming(openai_client, client_with_models, text_model_id, test_case):
+    tc = TestCase(test_case)
+    question = tc["question"]
+    expected = tc["expected"]
+
+    response = openai_client.responses.create(
+        model=text_model_id,
+        input=question,
+        stream=True,
+        timeout=120,  # Increase timeout to 2 minutes for large conversation history
+    )
+    streamed_content = []
+    response_id = ""
+    for chunk in response:
+        response_id = chunk.response.id
+        streamed_content.append(chunk.response.output_text.strip())
+
+    assert len(streamed_content) > 0
+    assert expected.lower() in "".join(streamed_content).lower()
+
+    retrieved_response = openai_client.responses.retrieve(response_id=response_id)
+    assert retrieved_response.output_text == "".join(streamed_content)
+
+    next_response = openai_client.responses.create(
+        model=text_model_id,
+        input="Repeat your previous response in all caps.",
+        previous_response_id=response_id,
+        stream=True,
+    )
+    next_streamed_content = []
+    for chunk in next_response:
+        next_streamed_content.append(chunk.response.output_text.strip())
+    assert expected.upper() in "".join(next_streamed_content)
diff --git a/tests/integration/test_cases/openai/responses.json b/tests/integration/test_cases/openai/responses.json
new file mode 100644
index 000000000..e7a132826
--- /dev/null
+++ b/tests/integration/test_cases/openai/responses.json
@@ -0,0 +1,26 @@
+{
+    "non_streaming_01": {
+        "data": {
+            "question": "Which planet do humans live on?",
+            "expected": "Earth"
+        }
+    },
+    "non_streaming_02": {
+        "data": {
+            "question": "Which planet has rings around it with a name starting with letter S?",
+            "expected": "Saturn"
+        }
+    },
+    "streaming_01": {
+        "data": {
+            "question": "What's the name of the Sun in Latin?",
+            "expected": "Sol"
+        }
+    },
+    "streaming_02": {
+        "data": {
+            "question": "What is the name of the US capital?",
+            "expected": "Washington"
+        }
+    }
+}
diff --git a/tests/integration/test_cases/test_case.py b/tests/integration/test_cases/test_case.py
index 8514f3046..2a3c73310 100644
--- a/tests/integration/test_cases/test_case.py
+++ b/tests/integration/test_cases/test_case.py
@@ -12,6 +12,7 @@ class TestCase:
     _apis = [
         "inference/chat_completion",
        "inference/completion",
+        "openai/responses",
     ]
     _jsonblob = {}
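
For reviewers who want to try the stub locally, here is a minimal usage sketch (not part of the patch) that mirrors the calls made in tests/integration/openai_responses/test_openai_responses.py. It assumes a Llama Stack server with the openai_responses provider is already running; the base URL port and the model id below are placeholders for whatever your distribution actually serves.

    # Usage sketch only -- not part of this patch.
    from openai import OpenAI

    # Assumptions: Llama Stack listening locally; model id is a placeholder.
    client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
    model_id = "meta-llama/Llama-3.2-3B-Instruct"  # replace with a model your stack serves

    # POST /openai/v1/responses
    response = client.responses.create(model=model_id, input="Which planet do humans live on?")
    print(response.output_text)

    # GET /openai/v1/responses/{id}
    retrieved = client.responses.retrieve(response_id=response.id)
    assert retrieved.output_text == response.output_text

    # Chain a second turn by passing previous_response_id
    follow_up = client.responses.create(
        model=model_id,
        input="Repeat your previous response in all caps.",
        previous_response_id=response.id,
    )
    print(follow_up.output_text)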