feat: OpenAI Responses API (#1989)

# What does this PR do?

This provides an initial [OpenAI Responses
API](https://platform.openai.com/docs/api-reference/responses)
implementation. The implementation is not yet complete; it is primarily a
proof of concept showing how we can store responses in our key-value
stores and use them to support Responses API concepts like
`previous_response_id`.
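
For a sense of the flow this enables, here's a minimal usage sketch (not
from this PR) that chains two turns via `previous_response_id` using the
standard OpenAI client pointed at a Llama Stack server. The base URL path
and dummy API key are assumptions and depend on how the routes are mounted:

```
# Hypothetical usage sketch; base_url and api_key values are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

first = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    input="Name one planet in our solar system.",
)

# Chaining via previous_response_id is what exercises the stored response:
# the server looks up the prior turn in its key-value store and builds on it.
followup = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    input="What is its largest moon?",
    previous_response_id=first.id,
)
print(followup.output_text)
```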

## Test Plan

I've added a new
`tests/integration/openai_responses/test_openai_responses.py` as part of
test-driven development of this new API. I'm only testing it locally with
the remote-vllm provider for now, but it should work with any of our
inference providers, since the only endpoint it requires from the
inference provider is `openai_chat_completion`.

```
VLLM_URL="http://localhost:8000/v1" \
INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \
llama stack build --template remote-vllm --image-type venv --run
```

```
LLAMA_STACK_CONFIG="http://localhost:8321" \
python -m pytest -v \
  tests/integration/openai_responses/test_openai_responses.py \
  --text-model "meta-llama/Llama-3.2-3B-Instruct"
```
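
For context, the checks in that file are roughly of the following shape
(an illustrative sketch, not the verbatim test contents; the fixture names
`openai_client` and `text_model_id` are assumptions):

```
# Illustrative sketch of the kind of assertions the integration test makes;
# the actual file contents may differ.
def test_basic_response_and_retrieval(openai_client, text_model_id):
    response = openai_client.responses.create(
        model=text_model_id,
        input="Which planet do humans live on?",
    )
    assert "earth" in response.output_text.lower()

    # The response should be retrievable by id, proving it was persisted.
    retrieved = openai_client.responses.retrieve(response.id)
    assert retrieved.id == response.id
```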

---------

Signed-off-by: Ben Browning <bbrownin@redhat.com>
Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>

The interesting wiring is in the meta-reference agents provider, which delegates the new Responses API surface to a dedicated `OpenAIResponsesImpl`:

```
@@ -23,6 +23,9 @@ from llama_stack.apis.agents import (
     Document,
     ListAgentSessionsResponse,
     ListAgentsResponse,
+    OpenAIResponseInputMessage,
+    OpenAIResponseInputTool,
+    OpenAIResponseObject,
     Session,
     Turn,
 )
@@ -40,6 +43,7 @@ from llama_stack.providers.utils.kvstore import InmemoryKVStoreImpl, kvstore_impl

 from .agent_instance import ChatAgent
 from .config import MetaReferenceAgentsImplConfig
+from .openai_responses import OpenAIResponsesImpl

 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
@@ -63,9 +67,16 @@ class MetaReferenceAgentsImpl(Agents):
         self.tool_groups_api = tool_groups_api

         self.in_memory_store = InmemoryKVStoreImpl()
+        self.openai_responses_impl = None

     async def initialize(self) -> None:
         self.persistence_store = await kvstore_impl(self.config.persistence_store)
+        self.openai_responses_impl = OpenAIResponsesImpl(
+            self.persistence_store,
+            inference_api=self.inference_api,
+            tool_groups_api=self.tool_groups_api,
+            tool_runtime_api=self.tool_runtime_api,
+        )

         # check if "bwrap" is available
         if not shutil.which("bwrap"):
@@ -244,3 +255,23 @@ class MetaReferenceAgentsImpl(Agents):
         agent_id: str,
     ) -> ListAgentSessionsResponse:
         pass
+
+    # OpenAI responses
+    async def get_openai_response(
+        self,
+        id: str,
+    ) -> OpenAIResponseObject:
+        return await self.openai_responses_impl.get_openai_response(id)
+
+    async def create_openai_response(
+        self,
+        input: Union[str, List[OpenAIResponseInputMessage]],
+        model: str,
+        previous_response_id: Optional[str] = None,
+        store: Optional[bool] = True,
+        stream: Optional[bool] = False,
+        tools: Optional[List[OpenAIResponseInputTool]] = None,
+    ) -> OpenAIResponseObject:
+        return await self.openai_responses_impl.create_openai_response(
+            input, model, previous_response_id, store, stream, tools
+        )
```
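
The storage side is deliberately simple: responses are serialized and keyed
by id in the persistence store. A minimal sketch of that pattern, assuming
the KVStore `get`/`set` interface used elsewhere in the repo and a Pydantic
`OpenAIResponseObject` (the class name and key prefix below are illustrative,
not the actual contents of `openai_responses.py`):

```
# Hypothetical sketch of the persistence pattern; the real logic also
# handles chat completion calls, tools, and streaming.
OPENAI_RESPONSES_PREFIX = "openai_responses:"  # illustrative key prefix

class ResponsesStoreSketch:
    def __init__(self, persistence_store):
        self.persistence_store = persistence_store

    async def get_openai_response(self, id: str) -> OpenAIResponseObject:
        # Look up a previously stored response by id (used both to serve
        # GET requests and to resolve previous_response_id on create).
        response_json = await self.persistence_store.get(f"{OPENAI_RESPONSES_PREFIX}{id}")
        if response_json is None:
            raise ValueError(f"Response with id '{id}' not found")
        return OpenAIResponseObject.model_validate_json(response_json)

    async def store_openai_response(self, response: OpenAIResponseObject) -> None:
        # Serialize the Pydantic object and persist it under its id.
        await self.persistence_store.set(
            key=f"{OPENAI_RESPONSES_PREFIX}{response.id}",
            value=response.model_dump_json(),
        )
```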