feat: OpenAI Responses API (#1989)

# What does this PR do?

This provides an initial [OpenAI Responses
API](https://platform.openai.com/docs/api-reference/responses)
implementation. The API is not yet complete; this is more of a proof of
concept showing how we can store responses in our key-value stores and
use them to support Responses API concepts such as
`previous_response_id`.
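
To make that concrete, here is a rough client-side sketch of chaining responses with `previous_response_id`. It assumes the stack is serving its OpenAI-compatible routes under `/openai/v1` on port 8321 (matching the routes added in this PR) and uses the official `openai` Python client; the base URL, API key, and model below are placeholders, not part of this change.

```python
# Hypothetical client-side sketch (not part of this PR): chaining responses
# via previous_response_id against a locally running Llama Stack server.
from openai import OpenAI

# Assumption: the stack exposes its OpenAI-compatible routes at this base URL.
client = OpenAI(base_url="http://localhost:8321/openai/v1", api_key="none")

first = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    input="Name a city in France.",
)

# Because the first response was stored server-side, the follow-up only needs
# its id -- the client does not resend the earlier conversation.
second = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    input="What is that city's most famous landmark?",
    previous_response_id=first.id,
)
print(second.output[0].content[0].text)
```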

## Test Plan

I've added a new
`tests/integration/openai_responses/test_openai_responses.py` as part of
test-driven development of this new API. I'm only testing this locally
with the remote-vllm provider for now, but it should work with any of
our inference providers, since the only API it needs from the inference
provider is the `openai_chat_completion` endpoint.

```
VLLM_URL="http://localhost:8000/v1" \
INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \
llama stack build --template remote-vllm --image-type venv --run
```

```
LLAMA_STACK_CONFIG="http://localhost:8321" \
python -m pytest -v \
  tests/integration/openai_responses/test_openai_responses.py \
  --text-model "meta-llama/Llama-3.2-3B-Instruct"
```
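
For reference, a minimal test in that file could look roughly like the sketch below. This is illustrative only: the `openai_client` and `text_model_id` fixture names are assumptions, not necessarily what the actual test file uses.

```python
# Hypothetical integration-test sketch; fixture names are assumptions.
def test_basic_response(openai_client, text_model_id):
    response = openai_client.responses.create(
        model=text_model_id,
        input="Say hello in one word.",
    )
    assert response.id
    assert response.output, "expected at least one output item"

    # Chain a follow-up turn off the stored response.
    follow_up = openai_client.responses.create(
        model=text_model_id,
        input="Now say it in French.",
        previous_response_id=response.id,
    )
    assert follow_up.previous_response_id == response.id
```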

---------

Signed-off-by: Ben Browning <bbrownin@redhat.com>
Co-authored-by: Ashwin Bharambe <ashwin.bharambe@gmail.com>
commit 8dfce2f596 (parent 79851d93aa)
Ben Browning, 2025-04-28 17:06:00 -04:00, committed by GitHub
21 changed files with 1766 additions and 59 deletions


```
@@ -38,6 +38,13 @@ from llama_stack.apis.safety import SafetyViolation
from llama_stack.apis.tools import ToolDef
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod

from .openai_responses import (
    OpenAIResponseInputMessage,
    OpenAIResponseInputTool,
    OpenAIResponseObject,
    OpenAIResponseObjectStream,
)


class Attachment(BaseModel):
    """An attachment to an agent turn.

@@ -593,3 +600,39 @@ class Agents(Protocol):
        :returns: A ListAgentSessionsResponse.
        """
        ...

    # We situate the OpenAI Responses API in the Agents API just like we did things
    # for Inference. The Responses API, in its intent, serves the same purpose as
    # the Agents API above -- it is essentially a lightweight "agentic loop" with
    # integrated tool calling.
    #
    # Both of these APIs are inherently stateful.

    @webmethod(route="/openai/v1/responses/{id}", method="GET")
    async def get_openai_response(
        self,
        id: str,
    ) -> OpenAIResponseObject:
        """Retrieve an OpenAI response by its ID.

        :param id: The ID of the OpenAI response to retrieve.
        :returns: An OpenAIResponseObject.
        """
        ...

    @webmethod(route="/openai/v1/responses", method="POST")
    async def create_openai_response(
        self,
        input: Union[str, List[OpenAIResponseInputMessage]],
        model: str,
        previous_response_id: Optional[str] = None,
        store: Optional[bool] = True,
        stream: Optional[bool] = False,
        tools: Optional[List[OpenAIResponseInputTool]] = None,
    ) -> Union[OpenAIResponseObject, AsyncIterator[OpenAIResponseObjectStream]]:
        """Create a new OpenAI response.

        :param input: Input message(s) to create the response.
        :param model: The underlying LLM used for completions.
        :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
        """
```


```
@@ -0,0 +1,140 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import List, Literal, Optional, Union

from pydantic import BaseModel, Field
from typing_extensions import Annotated

from llama_stack.schema_utils import json_schema_type, register_schema


@json_schema_type
class OpenAIResponseError(BaseModel):
    code: str
    message: str


@json_schema_type
class OpenAIResponseOutputMessageContentOutputText(BaseModel):
    text: str
    type: Literal["output_text"] = "output_text"


OpenAIResponseOutputMessageContent = Annotated[
    Union[OpenAIResponseOutputMessageContentOutputText,],
    Field(discriminator="type"),
]
register_schema(OpenAIResponseOutputMessageContent, name="OpenAIResponseOutputMessageContent")


@json_schema_type
class OpenAIResponseOutputMessage(BaseModel):
    id: str
    content: List[OpenAIResponseOutputMessageContent]
    role: Literal["assistant"] = "assistant"
    status: str
    type: Literal["message"] = "message"


@json_schema_type
class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
    id: str
    status: str
    type: Literal["web_search_call"] = "web_search_call"


OpenAIResponseOutput = Annotated[
    Union[
        OpenAIResponseOutputMessage,
        OpenAIResponseOutputMessageWebSearchToolCall,
    ],
    Field(discriminator="type"),
]
register_schema(OpenAIResponseOutput, name="OpenAIResponseOutput")


@json_schema_type
class OpenAIResponseObject(BaseModel):
    created_at: int
    error: Optional[OpenAIResponseError] = None
    id: str
    model: str
    object: Literal["response"] = "response"
    output: List[OpenAIResponseOutput]
    parallel_tool_calls: bool = False
    previous_response_id: Optional[str] = None
    status: str
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    truncation: Optional[str] = None
    user: Optional[str] = None


@json_schema_type
class OpenAIResponseObjectStreamResponseCreated(BaseModel):
    response: OpenAIResponseObject
    type: Literal["response.created"] = "response.created"


@json_schema_type
class OpenAIResponseObjectStreamResponseCompleted(BaseModel):
    response: OpenAIResponseObject
    type: Literal["response.completed"] = "response.completed"


OpenAIResponseObjectStream = Annotated[
    Union[
        OpenAIResponseObjectStreamResponseCreated,
        OpenAIResponseObjectStreamResponseCompleted,
    ],
    Field(discriminator="type"),
]
register_schema(OpenAIResponseObjectStream, name="OpenAIResponseObjectStream")


@json_schema_type
class OpenAIResponseInputMessageContentText(BaseModel):
    text: str
    type: Literal["input_text"] = "input_text"


@json_schema_type
class OpenAIResponseInputMessageContentImage(BaseModel):
    detail: Literal["low"] | Literal["high"] | Literal["auto"] = "auto"
    type: Literal["input_image"] = "input_image"
    # TODO: handle file_id
    image_url: Optional[str] = None


# TODO: handle file content types
OpenAIResponseInputMessageContent = Annotated[
    Union[OpenAIResponseInputMessageContentText, OpenAIResponseInputMessageContentImage],
    Field(discriminator="type"),
]
register_schema(OpenAIResponseInputMessageContent, name="OpenAIResponseInputMessageContent")


@json_schema_type
class OpenAIResponseInputMessage(BaseModel):
    content: Union[str, List[OpenAIResponseInputMessageContent]]
    role: Literal["system"] | Literal["developer"] | Literal["user"] | Literal["assistant"]
    type: Optional[Literal["message"]] = "message"


@json_schema_type
class OpenAIResponseInputToolWebSearch(BaseModel):
    type: Literal["web_search"] | Literal["web_search_preview_2025_03_11"] = "web_search"
    # TODO: actually use search_context_size somewhere...
    search_context_size: Optional[str] = Field(default="medium", pattern="^low|medium|high$")
    # TODO: add user_location


OpenAIResponseInputTool = Annotated[
    Union[OpenAIResponseInputToolWebSearch,],
    Field(discriminator="type"),
]
register_schema(OpenAIResponseInputTool, name="OpenAIResponseInputTool")
```
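
As a quick illustration of how the discriminated unions above are meant to be consumed, the sketch below validates a raw streaming event into `OpenAIResponseObjectStream` with Pydantic. It is not part of the diff, and the import path is an assumption based on the file layout implied by this PR.

```python
# Illustrative only: parsing a raw event dict with the models defined above.
from pydantic import TypeAdapter

# Assumed module path for the new file added in this PR.
from llama_stack.apis.agents.openai_responses import OpenAIResponseObjectStream

event = {
    "type": "response.completed",
    "response": {
        "id": "resp_123",
        "created_at": 1714000000,
        "model": "meta-llama/Llama-3.2-3B-Instruct",
        "object": "response",
        "status": "completed",
        "output": [
            {
                "type": "message",
                "id": "msg_1",
                "role": "assistant",
                "status": "completed",
                "content": [{"type": "output_text", "text": "Hello!"}],
            }
        ],
    },
}

# The "type" discriminator selects OpenAIResponseObjectStreamResponseCompleted.
parsed = TypeAdapter(OpenAIResponseObjectStream).validate_python(event)
print(type(parsed).__name__, parsed.response.output[0].content[0].text)
```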