From 70c088af3abc5609f11cb2bcc8eb1a944e51b6e7 Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Thu, 17 Apr 2025 14:47:24 -0400
Subject: [PATCH] Stub in an initial OpenAI Responses API

Signed-off-by: Ben Browning
---
 .../self_hosted_distro/remote-vllm.md         |   1 +
 llama_stack/apis/datatypes.py                 |   1 +
 llama_stack/apis/openai_responses/__init__.py |   7 +
 .../apis/openai_responses/openai_responses.py |  91 +++++++++++++
 llama_stack/distribution/resolver.py          |   2 +
 .../distribution/routers/routing_tables.py    |   2 +
 .../inline/openai_responses/__init__.py       |  19 +++
 .../inline/openai_responses/config.py         |  24 ++++
 .../openai_responses/openai_responses.py      | 126 ++++++++++++++++++
 .../providers/registry/openai_responses.py    |  25 ++++
 llama_stack/templates/remote-vllm/build.yaml  |   2 +
 .../remote-vllm/run-with-safety.yaml          |   9 ++
 llama_stack/templates/remote-vllm/run.yaml    |   9 ++
 llama_stack/templates/remote-vllm/vllm.py     |   1 +
 .../integration/openai_responses/__init__.py  |   5 +
 .../openai_responses/test_openai_responses.py |  90 +++++++++++++
 .../test_cases/openai/responses.json          |  26 ++++
 tests/integration/test_cases/test_case.py     |   1 +
 18 files changed, 441 insertions(+)
 create mode 100644 llama_stack/apis/openai_responses/__init__.py
 create mode 100644 llama_stack/apis/openai_responses/openai_responses.py
 create mode 100644 llama_stack/providers/inline/openai_responses/__init__.py
 create mode 100644 llama_stack/providers/inline/openai_responses/config.py
 create mode 100644 llama_stack/providers/inline/openai_responses/openai_responses.py
 create mode 100644 llama_stack/providers/registry/openai_responses.py
 create mode 100644 tests/integration/openai_responses/__init__.py
 create mode 100644 tests/integration/openai_responses/test_openai_responses.py
 create mode 100644 tests/integration/test_cases/openai/responses.json

diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md
index 46df56008..74365722d 100644
--- a/docs/source/distributions/self_hosted_distro/remote-vllm.md
+++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md
@@ -18,6 +18,7 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::vllm`, `inline::sentence-transformers` |
+| openai_responses | `inline::openai-responses` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py
index 25f3ab1ab..85c0ecc6b 100644
--- a/llama_stack/apis/datatypes.py
+++ b/llama_stack/apis/datatypes.py
@@ -24,6 +24,7 @@ class Api(Enum):
     eval = "eval"
     post_training = "post_training"
     tool_runtime = "tool_runtime"
+    openai_responses = "openai_responses"
 
     telemetry = "telemetry"
diff --git a/llama_stack/apis/openai_responses/__init__.py b/llama_stack/apis/openai_responses/__init__.py
new file mode 100644
index 000000000..a3b32ff71
--- /dev/null
+++ b/llama_stack/apis/openai_responses/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .openai_responses import *  # noqa: F401 F403
diff --git a/llama_stack/apis/openai_responses/openai_responses.py b/llama_stack/apis/openai_responses/openai_responses.py
new file mode 100644
index 000000000..c8324a13a
--- /dev/null
+++ b/llama_stack/apis/openai_responses/openai_responses.py
@@ -0,0 +1,91 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import AsyncIterator, List, Literal, Optional, Protocol, Union, runtime_checkable
+
+from pydantic import BaseModel, Field
+from typing_extensions import Annotated
+
+from llama_stack.schema_utils import json_schema_type, webmethod
+
+
+@json_schema_type
+class OpenAIResponseError(BaseModel):
+    code: str
+    message: str
+
+
+@json_schema_type
+class OpenAIResponseOutputMessageContentOutputText(BaseModel):
+    text: str
+    type: Literal["output_text"] = "output_text"
+
+
+OpenAIResponseOutputMessageContent = Annotated[
+    Union[OpenAIResponseOutputMessageContentOutputText,],
+    Field(discriminator="type"),
+]
+
+
+@json_schema_type
+class OpenAIResponseOutputMessage(BaseModel):
+    id: str
+    content: List[OpenAIResponseOutputMessageContent]
+    role: Literal["assistant"] = "assistant"
+    status: str
+    type: Literal["message"] = "message"
+
+
+OpenAIResponseOutput = Annotated[
+    Union[OpenAIResponseOutputMessage,],
+    Field(discriminator="type"),
+]
+
+
+@json_schema_type
+class OpenAIResponseObject(BaseModel):
+    created_at: int
+    error: Optional[OpenAIResponseError] = None
+    id: str
+    model: str
+    object: Literal["response"] = "response"
+    output: List[OpenAIResponseOutput]
+    parallel_tool_calls: bool = False
+    previous_response_id: Optional[str] = None
+    status: str
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    truncation: Optional[str] = None
+    user: Optional[str] = None
+
+
+@json_schema_type
+class OpenAIResponseObjectStream(BaseModel):
+    response: OpenAIResponseObject
+    type: Literal["response.created"] = "response.created"
+
+
+@runtime_checkable
+class OpenAIResponses(Protocol):
+    """
+    OpenAI Responses API implementation.
+    """
+
+    @webmethod(route="/openai/v1/responses/{id}", method="GET")
+    async def get_openai_response(
+        self,
+        id: str,
+    ) -> OpenAIResponseObject: ...
+
+    @webmethod(route="/openai/v1/responses", method="POST")
+    async def create_openai_response(
+        self,
+        input: str,
+        model: str,
+        previous_response_id: Optional[str] = None,
+        store: Optional[bool] = True,
+        stream: Optional[bool] = False,
+    ) -> Union[OpenAIResponseObject, AsyncIterator[OpenAIResponseObjectStream]]: ...
diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py
index e9a594eba..25c91fca1 100644
--- a/llama_stack/distribution/resolver.py
+++ b/llama_stack/distribution/resolver.py
@@ -16,6 +16,7 @@ from llama_stack.apis.files import Files
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.inspect import Inspect
 from llama_stack.apis.models import Models
+from llama_stack.apis.openai_responses.openai_responses import OpenAIResponses
 from llama_stack.apis.post_training import PostTraining
 from llama_stack.apis.providers import Providers as ProvidersAPI
 from llama_stack.apis.safety import Safety
@@ -80,6 +81,7 @@ def api_protocol_map() -> Dict[Api, Any]:
         Api.tool_groups: ToolGroups,
         Api.tool_runtime: ToolRuntime,
         Api.files: Files,
+        Api.openai_responses: OpenAIResponses,
     }
diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py
index 18b0c891f..50416f338 100644
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@@ -149,6 +149,8 @@ class CommonRoutingTableImpl(RoutingTable):
                 p.benchmark_store = self
             elif api == Api.tool_runtime:
                 p.tool_store = self
+            elif api == Api.openai_responses:
+                p.model_store = self
 
     async def shutdown(self) -> None:
         for p in self.impls_by_provider_id.values():
diff --git a/llama_stack/providers/inline/openai_responses/__init__.py b/llama_stack/providers/inline/openai_responses/__init__.py
new file mode 100644
index 000000000..6d461e81a
--- /dev/null
+++ b/llama_stack/providers/inline/openai_responses/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict
+
+from llama_stack.apis.datatypes import Api
+
+from .config import OpenAIResponsesImplConfig
+
+
+async def get_provider_impl(config: OpenAIResponsesImplConfig, deps: Dict[Api, Any]):
+    from .openai_responses import OpenAIResponsesImpl
+
+    impl = OpenAIResponsesImpl(config, deps[Api.models], deps[Api.inference])
+    await impl.initialize()
+    return impl
diff --git a/llama_stack/providers/inline/openai_responses/config.py b/llama_stack/providers/inline/openai_responses/config.py
new file mode 100644
index 000000000..f97b2fe68
--- /dev/null
+++ b/llama_stack/providers/inline/openai_responses/config.py
@@ -0,0 +1,24 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict
+
+from pydantic import BaseModel
+
+from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
+
+
+class OpenAIResponsesImplConfig(BaseModel):
+    kvstore: KVStoreConfig
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {
+            "kvstore": SqliteKVStoreConfig.sample_run_config(
+                __distro_dir__=__distro_dir__,
+                db_name="openai_responses.db",
+            )
+        }
diff --git a/llama_stack/providers/inline/openai_responses/openai_responses.py b/llama_stack/providers/inline/openai_responses/openai_responses.py
new file mode 100644
index 000000000..c23fe9b75
--- /dev/null
+++ b/llama_stack/providers/inline/openai_responses/openai_responses.py
@@ -0,0 +1,126 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import uuid
+from typing import AsyncIterator, List, Optional, cast
+
+from llama_stack.apis.inference.inference import (
+    Inference,
+    OpenAIAssistantMessageParam,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionContentPartTextParam,
+    OpenAIMessageParam,
+    OpenAIUserMessageParam,
+)
+from llama_stack.apis.models.models import Models, ModelType
+from llama_stack.apis.openai_responses import OpenAIResponses
+from llama_stack.apis.openai_responses.openai_responses import (
+    OpenAIResponseObject,
+    OpenAIResponseObjectStream,
+    OpenAIResponseOutputMessage,
+    OpenAIResponseOutputMessageContentOutputText,
+)
+from llama_stack.log import get_logger
+from llama_stack.providers.utils.kvstore import kvstore_impl
+
+from .config import OpenAIResponsesImplConfig
+
+logger = get_logger(name=__name__, category="openai_responses")
+
+OPENAI_RESPONSES_PREFIX = "openai_responses:"
+
+
+class OpenAIResponsesImpl(OpenAIResponses):
+    def __init__(self, config: OpenAIResponsesImplConfig, models_api: Models, inference_api: Inference):
+        self.config = config
+        self.models_api = models_api
+        self.inference_api = inference_api
+
+    async def initialize(self) -> None:
+        self.kvstore = await kvstore_impl(self.config.kvstore)
+
+    async def shutdown(self) -> None:
+        logger.debug("OpenAIResponsesImpl.shutdown")
+        pass
+
+    async def get_openai_response(
+        self,
+        id: str,
+    ) -> OpenAIResponseObject:
+        key = f"{OPENAI_RESPONSES_PREFIX}{id}"
+        response_json = await self.kvstore.get(key=key)
+        if response_json is None:
+            raise ValueError(f"OpenAI response with id '{id}' not found")
+        return OpenAIResponseObject.model_validate_json(response_json)
+
+    async def create_openai_response(
+        self,
+        input: str,
+        model: str,
+        previous_response_id: Optional[str] = None,
+        store: Optional[bool] = True,
+        stream: Optional[bool] = False,
+    ):
+        model_obj = await self.models_api.get_model(model)
+        if model_obj is None:
+            raise ValueError(f"Model '{model}' not found")
+        if model_obj.model_type == ModelType.embedding:
+            raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions")
+
+        messages: List[OpenAIMessageParam] = []
+        if previous_response_id:
+            previous_response = await self.get_openai_response(previous_response_id)
+            messages.append(OpenAIAssistantMessageParam(content=previous_response.output[0].content[0].text))
+
+        messages.append(OpenAIUserMessageParam(content=input))
+
+        chat_response = await self.inference_api.openai_chat_completion(
+            model=model_obj.identifier,
+            messages=messages,
+        )
+        # type cast to appease mypy
+        chat_response = cast(OpenAIChatCompletion, chat_response)
+
+        output_messages = []
+        for choice in chat_response.choices:
+            output_content = ""
+            if isinstance(choice.message.content, str):
+                output_content = choice.message.content
+            elif isinstance(choice.message.content, OpenAIChatCompletionContentPartTextParam):
+                output_content = choice.message.content.text
+            # TODO: handle image content
+            output_messages.append(
+                OpenAIResponseOutputMessage(
+                    id=f"msg_{uuid.uuid4()}",
+                    content=[OpenAIResponseOutputMessageContentOutputText(text=output_content)],
+                    status="completed",
+                )
+            )
+        response = OpenAIResponseObject(
+            created_at=chat_response.created,
+            id=f"resp-{uuid.uuid4()}",
+            model=model_obj.identifier,
+            object="response",
+            status="completed",
+            output=output_messages,
+        )
+
+        if store:
+            # Store in kvstore
+            key = f"{OPENAI_RESPONSES_PREFIX}{response.id}"
+            await self.kvstore.set(
+                key=key,
+                value=response.model_dump_json(),
+            )
+
+        if stream:
+
+            async def async_response() -> AsyncIterator[OpenAIResponseObjectStream]:
+                yield OpenAIResponseObjectStream(response=response)
+
+            return async_response()
+
+        return response
diff --git a/llama_stack/providers/registry/openai_responses.py b/llama_stack/providers/registry/openai_responses.py
new file mode 100644
index 000000000..dd60f19dc
--- /dev/null
+++ b/llama_stack/providers/registry/openai_responses.py
@@ -0,0 +1,25 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import List
+
+from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
+
+
+def available_providers() -> List[ProviderSpec]:
+    return [
+        InlineProviderSpec(
+            api=Api.openai_responses,
+            provider_type="inline::openai-responses",
+            pip_packages=[],
+            module="llama_stack.providers.inline.openai_responses",
+            config_class="llama_stack.providers.inline.openai_responses.config.OpenAIResponsesImplConfig",
+            api_dependencies=[
+                Api.models,
+                Api.inference,
+            ],
+        ),
+    ]
diff --git a/llama_stack/templates/remote-vllm/build.yaml b/llama_stack/templates/remote-vllm/build.yaml
index b2bbf853a..94326e570 100644
--- a/llama_stack/templates/remote-vllm/build.yaml
+++ b/llama_stack/templates/remote-vllm/build.yaml
@@ -31,4 +31,6 @@ distribution_spec:
     - inline::rag-runtime
     - remote::model-context-protocol
     - remote::wolfram-alpha
+    openai_responses:
+    - inline::openai-responses
 image_type: conda
diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml
index bb69496aa..c53228ed4 100644
--- a/llama_stack/templates/remote-vllm/run-with-safety.yaml
+++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml
@@ -5,6 +5,7 @@ apis:
 - datasetio
 - eval
 - inference
+- openai_responses
 - safety
 - scoring
 - telemetry
@@ -115,6 +116,14 @@ providers:
     provider_type: remote::wolfram-alpha
     config:
       api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
+  openai_responses:
+  - provider_id: openai-responses
+    provider_type: inline::openai-responses
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/openai_responses.db
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db
diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml
index 14f2da37e..282749d0d 100644
--- a/llama_stack/templates/remote-vllm/run.yaml
+++ b/llama_stack/templates/remote-vllm/run.yaml
@@ -5,6 +5,7 @@ apis:
 - datasetio
 - eval
 - inference
+- openai_responses
 - safety
 - scoring
 - telemetry
@@ -108,6 +109,14 @@ providers:
     provider_type: remote::wolfram-alpha
     config:
       api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
+  openai_responses:
+  - provider_id: openai-responses
+    provider_type: inline::openai-responses
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/openai_responses.db
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db
diff --git a/llama_stack/templates/remote-vllm/vllm.py b/llama_stack/templates/remote-vllm/vllm.py
index 0f6c7659e..5cddb1c76 100644
--- a/llama_stack/templates/remote-vllm/vllm.py
+++ b/llama_stack/templates/remote-vllm/vllm.py
@@ -39,6 +39,7 @@ def get_distribution_template() -> DistributionTemplate:
             "remote::model-context-protocol",
             "remote::wolfram-alpha",
         ],
+        "openai_responses": ["inline::openai-responses"],
     }
     name = "remote-vllm"
     inference_provider = Provider(
diff --git a/tests/integration/openai_responses/__init__.py b/tests/integration/openai_responses/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/tests/integration/openai_responses/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/tests/integration/openai_responses/test_openai_responses.py b/tests/integration/openai_responses/test_openai_responses.py
new file mode 100644
index 000000000..870c14636
--- /dev/null
+++ b/tests/integration/openai_responses/test_openai_responses.py
@@ -0,0 +1,90 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+import pytest
+from openai import OpenAI
+
+from ..test_cases.test_case import TestCase
+
+
+@pytest.fixture
+def openai_client(client_with_models):
+    base_url = f"{client_with_models.base_url}/v1/openai/v1"
+    return OpenAI(base_url=base_url, api_key="bar")
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "openai:responses:non_streaming_01",
+        "openai:responses:non_streaming_02",
+    ],
+)
+def test_openai_responses_non_streaming(openai_client, client_with_models, text_model_id, test_case):
+    tc = TestCase(test_case)
+    question = tc["question"]
+    expected = tc["expected"]
+
+    response = openai_client.responses.create(
+        model=text_model_id,
+        input=question,
+        stream=False,
+    )
+    output_text = response.output_text.lower().strip()
+    assert len(output_text) > 0
+    assert expected.lower() in output_text
+
+    retrieved_response = openai_client.responses.retrieve(response_id=response.id)
+    assert retrieved_response.output_text == response.output_text
+
+    next_response = openai_client.responses.create(
+        model=text_model_id, input="Repeat your previous response in all caps.", previous_response_id=response.id
+    )
+    next_output_text = next_response.output_text.strip()
+    assert expected.upper() in next_output_text
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "openai:responses:streaming_01",
+        "openai:responses:streaming_02",
+    ],
+)
+def test_openai_responses_streaming(openai_client, client_with_models, text_model_id, test_case):
+    tc = TestCase(test_case)
+    question = tc["question"]
+    expected = tc["expected"]
+
+    response = openai_client.responses.create(
+        model=text_model_id,
+        input=question,
+        stream=True,
+        timeout=120,  # Increase timeout to 2 minutes for large conversation history
+    )
+    streamed_content = []
+    response_id = ""
+    for chunk in response:
+        response_id = chunk.response.id
+        streamed_content.append(chunk.response.output_text.strip())
+
+    assert len(streamed_content) > 0
+    assert expected.lower() in "".join(streamed_content).lower()
+
+    retrieved_response = openai_client.responses.retrieve(response_id=response_id)
+    assert retrieved_response.output_text == "".join(streamed_content)
+
+    next_response = openai_client.responses.create(
+        model=text_model_id,
+        input="Repeat your previous response in all caps.",
+        previous_response_id=response_id,
+        stream=True,
+    )
+    next_streamed_content = []
+    for chunk in next_response:
+        next_streamed_content.append(chunk.response.output_text.strip())
+    assert expected.upper() in "".join(next_streamed_content)
diff --git a/tests/integration/test_cases/openai/responses.json b/tests/integration/test_cases/openai/responses.json
new file mode 100644
index 000000000..e7a132826
--- /dev/null
+++ b/tests/integration/test_cases/openai/responses.json
@@ -0,0 +1,26 @@
+{
+    "non_streaming_01": {
+        "data": {
+            "question": "Which planet do humans live on?",
+            "expected": "Earth"
+        }
+    },
+    "non_streaming_02": {
+        "data": {
+            "question": "Which planet has rings around it with a name starting with letter S?",
+            "expected": "Saturn"
+        }
+    },
+    "streaming_01": {
+        "data": {
+            "question": "What's the name of the Sun in Latin?",
+            "expected": "Sol"
+        }
+    },
+    "streaming_02": {
+        "data": {
+            "question": "What is the name of the US capital?",
+            "expected": "Washington"
+        }
+    }
+}
diff --git a/tests/integration/test_cases/test_case.py b/tests/integration/test_cases/test_case.py
index 8514f3046..2a3c73310 100644
--- a/tests/integration/test_cases/test_case.py
+++ b/tests/integration/test_cases/test_case.py
@@ -12,6 +12,7 @@ class TestCase:
     _apis = [
         "inference/chat_completion",
        "inference/completion",
+        "openai/responses",
     ]
     _jsonblob = {}
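
For reviewers who want to try the stub locally, here is a minimal usage sketch (not part of the patch) that mirrors the calls made in tests/integration/openai_responses/test_openai_responses.py. It assumes a Llama Stack server with the openai_responses provider is already running; the base URL port and the model id below are placeholders for whatever your distribution actually serves.

    # Usage sketch only -- not part of this patch.
    from openai import OpenAI

    # Assumptions: Llama Stack listening locally; model id is a placeholder.
    client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
    model_id = "meta-llama/Llama-3.2-3B-Instruct"  # replace with a model your stack serves

    # POST /openai/v1/responses
    response = client.responses.create(model=model_id, input="Which planet do humans live on?")
    print(response.output_text)

    # GET /openai/v1/responses/{id}
    retrieved = client.responses.retrieve(response_id=response.id)
    assert retrieved.output_text == response.output_text

    # Chain a second turn by passing previous_response_id
    follow_up = client.responses.create(
        model=model_id,
        input="Repeat your previous response in all caps.",
        previous_response_id=response.id,
    )
    print(follow_up.output_text)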