fold openai responses into the Agents API

2025-12-30 17:33:11 +00:00 · 2025-04-28 10:27:28 -07:00 · 2025-04-28 10:27:28 -07:00 · abd6280cb8
commit abd6280cb8
parent 207224a811
25 changed files with 967 additions and 199 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -497,6 +497,54 @@
                }
            }
        },
        "/v1/openai/v1/responses": {
            "post": {
                "responses": {
                    "200": {
                        "description": "Runtime representation of an annotated type.",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/OpenAIResponseObject"
                                }
                            },
                            "text/event-stream": {
                                "schema": {
                                    "$ref": "#/components/schemas/OpenAIResponseObjectStream"
                                }
                            }
                        }
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
                    },
                    "429": {
                        "$ref": "#/components/responses/TooManyRequests429"
                    },
                    "500": {
                        "$ref": "#/components/responses/InternalServerError500"
                    },
                    "default": {
                        "$ref": "#/components/responses/DefaultError"
                    }
                },
                "tags": [
                    "Agents"
                ],
                "description": "Create a new OpenAI response.",
                "parameters": [],
                "requestBody": {
                    "content": {
                        "application/json": {
                            "schema": {
                                "$ref": "#/components/schemas/CreateOpenaiResponseRequest"
                            }
                        }
                    },
                    "required": true
                }
            }
        },
        "/v1/files": {
            "get": {
                "responses": {
@ -1278,6 +1326,49 @@
                ]
            }
        },
        "/v1/openai/v1/responses/{id}": {
            "get": {
                "responses": {
                    "200": {
                        "description": "An OpenAIResponseObject.",
                        "content": {
                            "application/json": {
                                "schema": {
                                    "$ref": "#/components/schemas/OpenAIResponseObject"
                                }
                            }
                        }
                    },
                    "400": {
                        "$ref": "#/components/responses/BadRequest400"
                    },
                    "429": {
                        "$ref": "#/components/responses/TooManyRequests429"
                    },
                    "500": {
                        "$ref": "#/components/responses/InternalServerError500"
                    },
                    "default": {
                        "$ref": "#/components/responses/DefaultError"
                    }
                },
                "tags": [
                    "Agents"
                ],
                "description": "Retrieve an OpenAI response by its ID.",
                "parameters": [
                    {
                        "name": "id",
                        "in": "path",
                        "description": "The ID of the OpenAI response to retrieve.",
                        "required": true,
                        "schema": {
                            "type": "string"
                        }
                    }
                ]
            }
        },
        "/v1/scoring-functions/{scoring_fn_id}": {
            "get": {
                "responses": {
@ -6192,6 +6283,427 @@
                ],
                "title": "AgentTurnResponseTurnStartPayload"
            },
            "OpenAIResponseInputMessage": {
                "type": "object",
                "properties": {
                    "content": {
                        "oneOf": [
                            {
                                "type": "string"
                            },
                            {
                                "type": "array",
                                "items": {
                                    "$ref": "#/components/schemas/OpenAIResponseInputMessageContent"
                                }
                            }
                        ]
                    },
                    "role": {
                        "oneOf": [
                            {
                                "type": "string",
                                "const": "system"
                            },
                            {
                                "type": "string",
                                "const": "developer"
                            },
                            {
                                "type": "string",
                                "const": "user"
                            },
                            {
                                "type": "string",
                                "const": "assistant"
                            }
                        ]
                    },
                    "type": {
                        "type": "string",
                        "const": "message",
                        "default": "message"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "content",
                    "role"
                ],
                "title": "OpenAIResponseInputMessage"
            },
            "OpenAIResponseInputMessageContent": {
                "oneOf": [
                    {
                        "$ref": "#/components/schemas/OpenAIResponseInputMessageContentText"
                    },
                    {
                        "$ref": "#/components/schemas/OpenAIResponseInputMessageContentImage"
                    }
                ],
                "discriminator": {
                    "propertyName": "type",
                    "mapping": {
                        "input_text": "#/components/schemas/OpenAIResponseInputMessageContentText",
                        "input_image": "#/components/schemas/OpenAIResponseInputMessageContentImage"
                    }
                }
            },
            "OpenAIResponseInputMessageContentImage": {
                "type": "object",
                "properties": {
                    "detail": {
                        "oneOf": [
                            {
                                "type": "string",
                                "const": "low"
                            },
                            {
                                "type": "string",
                                "const": "high"
                            },
                            {
                                "type": "string",
                                "const": "auto"
                            }
                        ],
                        "default": "auto"
                    },
                    "type": {
                        "type": "string",
                        "const": "input_image",
                        "default": "input_image"
                    },
                    "image_url": {
                        "type": "string"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "detail",
                    "type"
                ],
                "title": "OpenAIResponseInputMessageContentImage"
            },
            "OpenAIResponseInputMessageContentText": {
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string"
                    },
                    "type": {
                        "type": "string",
                        "const": "input_text",
                        "default": "input_text"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "text",
                    "type"
                ],
                "title": "OpenAIResponseInputMessageContentText"
            },
            "OpenAIResponseInputTool": {
                "type": "object",
                "properties": {
                    "type": {
                        "oneOf": [
                            {
                                "type": "string",
                                "const": "web_search"
                            },
                            {
                                "type": "string",
                                "const": "web_search_preview_2025_03_11"
                            }
                        ],
                        "default": "web_search"
                    },
                    "search_context_size": {
                        "type": "string",
                        "default": "medium"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "type"
                ],
                "title": "OpenAIResponseInputToolWebSearch"
            },
            "CreateOpenaiResponseRequest": {
                "type": "object",
                "properties": {
                    "input": {
                        "oneOf": [
                            {
                                "type": "string"
                            },
                            {
                                "type": "array",
                                "items": {
                                    "$ref": "#/components/schemas/OpenAIResponseInputMessage"
                                }
                            }
                        ],
                        "description": "Input message(s) to create the response."
                    },
                    "model": {
                        "type": "string",
                        "description": "The underlying LLM used for completions."
                    },
                    "previous_response_id": {
                        "type": "string",
                        "description": "(Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses."
                    },
                    "store": {
                        "type": "boolean"
                    },
                    "stream": {
                        "type": "boolean"
                    },
                    "tools": {
                        "type": "array",
                        "items": {
                            "$ref": "#/components/schemas/OpenAIResponseInputTool"
                        }
                    }
                },
                "additionalProperties": false,
                "required": [
                    "input",
                    "model"
                ],
                "title": "CreateOpenaiResponseRequest"
            },
            "OpenAIResponseError": {
                "type": "object",
                "properties": {
                    "code": {
                        "type": "string"
                    },
                    "message": {
                        "type": "string"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "code",
                    "message"
                ],
                "title": "OpenAIResponseError"
            },
            "OpenAIResponseObject": {
                "type": "object",
                "properties": {
                    "created_at": {
                        "type": "integer"
                    },
                    "error": {
                        "$ref": "#/components/schemas/OpenAIResponseError"
                    },
                    "id": {
                        "type": "string"
                    },
                    "model": {
                        "type": "string"
                    },
                    "object": {
                        "type": "string",
                        "const": "response",
                        "default": "response"
                    },
                    "output": {
                        "type": "array",
                        "items": {
                            "$ref": "#/components/schemas/OpenAIResponseOutput"
                        }
                    },
                    "parallel_tool_calls": {
                        "type": "boolean",
                        "default": false
                    },
                    "previous_response_id": {
                        "type": "string"
                    },
                    "status": {
                        "type": "string"
                    },
                    "temperature": {
                        "type": "number"
                    },
                    "top_p": {
                        "type": "number"
                    },
                    "truncation": {
                        "type": "string"
                    },
                    "user": {
                        "type": "string"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "created_at",
                    "id",
                    "model",
                    "object",
                    "output",
                    "parallel_tool_calls",
                    "status"
                ],
                "title": "OpenAIResponseObject"
            },
            "OpenAIResponseOutput": {
                "oneOf": [
                    {
                        "$ref": "#/components/schemas/OpenAIResponseOutputMessage"
                    },
                    {
                        "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
                    }
                ],
                "discriminator": {
                    "propertyName": "type",
                    "mapping": {
                        "message": "#/components/schemas/OpenAIResponseOutputMessage",
                        "web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
                    }
                }
            },
            "OpenAIResponseOutputMessage": {
                "type": "object",
                "properties": {
                    "id": {
                        "type": "string"
                    },
                    "content": {
                        "type": "array",
                        "items": {
                            "$ref": "#/components/schemas/OpenAIResponseOutputMessageContent"
                        }
                    },
                    "role": {
                        "type": "string",
                        "const": "assistant",
                        "default": "assistant"
                    },
                    "status": {
                        "type": "string"
                    },
                    "type": {
                        "type": "string",
                        "const": "message",
                        "default": "message"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "id",
                    "content",
                    "role",
                    "status",
                    "type"
                ],
                "title": "OpenAIResponseOutputMessage"
            },
            "OpenAIResponseOutputMessageContent": {
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string"
                    },
                    "type": {
                        "type": "string",
                        "const": "output_text",
                        "default": "output_text"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "text",
                    "type"
                ],
                "title": "OpenAIResponseOutputMessageContentOutputText"
            },
            "OpenAIResponseOutputMessageWebSearchToolCall": {
                "type": "object",
                "properties": {
                    "id": {
                        "type": "string"
                    },
                    "status": {
                        "type": "string"
                    },
                    "type": {
                        "type": "string",
                        "const": "web_search_call",
                        "default": "web_search_call"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "id",
                    "status",
                    "type"
                ],
                "title": "OpenAIResponseOutputMessageWebSearchToolCall"
            },
            "OpenAIResponseObjectStream": {
                "oneOf": [
                    {
                        "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCreated"
                    },
                    {
                        "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
                    }
                ],
                "discriminator": {
                    "propertyName": "type",
                    "mapping": {
                        "response.created": "#/components/schemas/OpenAIResponseObjectStreamResponseCreated",
                        "response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
                    }
                }
            },
            "OpenAIResponseObjectStreamResponseCompleted": {
                "type": "object",
                "properties": {
                    "response": {
                        "$ref": "#/components/schemas/OpenAIResponseObject"
                    },
                    "type": {
                        "type": "string",
                        "const": "response.completed",
                        "default": "response.completed"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "response",
                    "type"
                ],
                "title": "OpenAIResponseObjectStreamResponseCompleted"
            },
            "OpenAIResponseObjectStreamResponseCreated": {
                "type": "object",
                "properties": {
                    "response": {
                        "$ref": "#/components/schemas/OpenAIResponseObject"
                    },
                    "type": {
                        "type": "string",
                        "const": "response.created",
                        "default": "response.created"
                    }
                },
                "additionalProperties": false,
                "required": [
                    "response",
                    "type"
                ],
                "title": "OpenAIResponseObjectStreamResponseCreated"
            },
            "CreateUploadSessionRequest": {
                "type": "object",
                "properties": {
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -330,6 +330,39 @@ paths:
            schema:
              $ref: '#/components/schemas/CreateAgentTurnRequest'
        required: true
  /v1/openai/v1/responses:
    post:
      responses:
        '200':
          description: >-
            Runtime representation of an annotated type.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/OpenAIResponseObject'
            text/event-stream:
              schema:
                $ref: '#/components/schemas/OpenAIResponseObjectStream'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Agents
      description: Create a new OpenAI response.
      parameters: []
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateOpenaiResponseRequest'
        required: true
  /v1/files:
    get:
      responses:
@ -875,6 +908,36 @@ paths:
          required: true
          schema:
            type: string
  /v1/openai/v1/responses/{id}:
    get:
      responses:
        '200':
          description: An OpenAIResponseObject.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/OpenAIResponseObject'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Agents
      description: Retrieve an OpenAI response by its ID.
      parameters:
        - name: id
          in: path
          description: >-
            The ID of the OpenAI response to retrieve.
          required: true
          schema:
            type: string
  /v1/scoring-functions/{scoring_fn_id}:
    get:
      responses:
@ -4329,6 +4392,293 @@ components:
        - event_type
        - turn_id
      title: AgentTurnResponseTurnStartPayload
    OpenAIResponseInputMessage:
      type: object
      properties:
        content:
          oneOf:
            - type: string
            - type: array
              items:
                $ref: '#/components/schemas/OpenAIResponseInputMessageContent'
        role:
          oneOf:
            - type: string
              const: system
            - type: string
              const: developer
            - type: string
              const: user
            - type: string
              const: assistant
        type:
          type: string
          const: message
          default: message
      additionalProperties: false
      required:
        - content
        - role
      title: OpenAIResponseInputMessage
    OpenAIResponseInputMessageContent:
      oneOf:
        - $ref: '#/components/schemas/OpenAIResponseInputMessageContentText'
        - $ref: '#/components/schemas/OpenAIResponseInputMessageContentImage'
      discriminator:
        propertyName: type
        mapping:
          input_text: '#/components/schemas/OpenAIResponseInputMessageContentText'
          input_image: '#/components/schemas/OpenAIResponseInputMessageContentImage'
    OpenAIResponseInputMessageContentImage:
      type: object
      properties:
        detail:
          oneOf:
            - type: string
              const: low
            - type: string
              const: high
            - type: string
              const: auto
          default: auto
        type:
          type: string
          const: input_image
          default: input_image
        image_url:
          type: string
      additionalProperties: false
      required:
        - detail
        - type
      title: OpenAIResponseInputMessageContentImage
    OpenAIResponseInputMessageContentText:
      type: object
      properties:
        text:
          type: string
        type:
          type: string
          const: input_text
          default: input_text
      additionalProperties: false
      required:
        - text
        - type
      title: OpenAIResponseInputMessageContentText
    OpenAIResponseInputTool:
      type: object
      properties:
        type:
          oneOf:
            - type: string
              const: web_search
            - type: string
              const: web_search_preview_2025_03_11
          default: web_search
        search_context_size:
          type: string
          default: medium
      additionalProperties: false
      required:
        - type
      title: OpenAIResponseInputToolWebSearch
    CreateOpenaiResponseRequest:
      type: object
      properties:
        input:
          oneOf:
            - type: string
            - type: array
              items:
                $ref: '#/components/schemas/OpenAIResponseInputMessage'
          description: Input message(s) to create the response.
        model:
          type: string
          description: The underlying LLM used for completions.
        previous_response_id:
          type: string
          description: >-
            (Optional) if specified, the new response will be a continuation of the
            previous response. This can be used to easily fork-off new responses from
            existing responses.
        store:
          type: boolean
        stream:
          type: boolean
        tools:
          type: array
          items:
            $ref: '#/components/schemas/OpenAIResponseInputTool'
      additionalProperties: false
      required:
        - input
        - model
      title: CreateOpenaiResponseRequest
    OpenAIResponseError:
      type: object
      properties:
        code:
          type: string
        message:
          type: string
      additionalProperties: false
      required:
        - code
        - message
      title: OpenAIResponseError
    OpenAIResponseObject:
      type: object
      properties:
        created_at:
          type: integer
        error:
          $ref: '#/components/schemas/OpenAIResponseError'
        id:
          type: string
        model:
          type: string
        object:
          type: string
          const: response
          default: response
        output:
          type: array
          items:
            $ref: '#/components/schemas/OpenAIResponseOutput'
        parallel_tool_calls:
          type: boolean
          default: false
        previous_response_id:
          type: string
        status:
          type: string
        temperature:
          type: number
        top_p:
          type: number
        truncation:
          type: string
        user:
          type: string
      additionalProperties: false
      required:
        - created_at
        - id
        - model
        - object
        - output
        - parallel_tool_calls
        - status
      title: OpenAIResponseObject
    OpenAIResponseOutput:
      oneOf:
        - $ref: '#/components/schemas/OpenAIResponseOutputMessage'
        - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
      discriminator:
        propertyName: type
        mapping:
          message: '#/components/schemas/OpenAIResponseOutputMessage'
          web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
    OpenAIResponseOutputMessage:
      type: object
      properties:
        id:
          type: string
        content:
          type: array
          items:
            $ref: '#/components/schemas/OpenAIResponseOutputMessageContent'
        role:
          type: string
          const: assistant
          default: assistant
        status:
          type: string
        type:
          type: string
          const: message
          default: message
      additionalProperties: false
      required:
        - id
        - content
        - role
        - status
        - type
      title: OpenAIResponseOutputMessage
    OpenAIResponseOutputMessageContent:
      type: object
      properties:
        text:
          type: string
        type:
          type: string
          const: output_text
          default: output_text
      additionalProperties: false
      required:
        - text
        - type
      title: >-
        OpenAIResponseOutputMessageContentOutputText
    "OpenAIResponseOutputMessageWebSearchToolCall":
      type: object
      properties:
        id:
          type: string
        status:
          type: string
        type:
          type: string
          const: web_search_call
          default: web_search_call
      additionalProperties: false
      required:
        - id
        - status
        - type
      title: >-
        OpenAIResponseOutputMessageWebSearchToolCall
    OpenAIResponseObjectStream:
      oneOf:
        - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
        - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
      discriminator:
        propertyName: type
        mapping:
          response.created: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
          response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
    "OpenAIResponseObjectStreamResponseCompleted":
      type: object
      properties:
        response:
          $ref: '#/components/schemas/OpenAIResponseObject'
        type:
          type: string
          const: response.completed
          default: response.completed
      additionalProperties: false
      required:
        - response
        - type
      title: >-
        OpenAIResponseObjectStreamResponseCompleted
    "OpenAIResponseObjectStreamResponseCreated":
      type: object
      properties:
        response:
          $ref: '#/components/schemas/OpenAIResponseObject'
        type:
          type: string
          const: response.created
          default: response.created
      additionalProperties: false
      required:
        - response
        - type
      title: >-
        OpenAIResponseObjectStreamResponseCreated
    CreateUploadSessionRequest:
      type: object
      properties:
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@ -179,7 +179,7 @@ class ContentBuilder:
        "Creates the content subtree for a request or response."
        def is_iterator_type(t):
-            return "StreamChunk" in str(t)
+            return "StreamChunk" in str(t) or "OpenAIResponseObjectStream" in str(t)
        def get_media_type(t):
            if is_generic_list(t):
--- a/docs/source/distributions/self_hosted_distro/remote-vllm.md
+++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md
@ -18,7 +18,6 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::vllm`, `inline::sentence-transformers` |
 | openai_responses | `inline::openai-responses` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
--- a/docs/source/distributions/self_hosted_distro/together.md
+++ b/docs/source/distributions/self_hosted_distro/together.md
@ -19,7 +19,6 @@ The `llamastack/distribution-together` distribution consists of the following pr
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::together`, `inline::sentence-transformers` |
 | openai_responses | `inline::openai-responses` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
--- a/llama_stack/apis/agents/agents.py
+++ b/llama_stack/apis/agents/agents.py
@ -38,6 +38,13 @@ from llama_stack.apis.safety import SafetyViolation
 from llama_stack.apis.tools import ToolDef
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 from .openai_responses import (
    OpenAIResponseInputMessage,
    OpenAIResponseInputTool,
    OpenAIResponseObject,
    OpenAIResponseObjectStream,
 )
 class Attachment(BaseModel):
    """An attachment to an agent turn.
@ -593,3 +600,39 @@ class Agents(Protocol):
        :returns: A ListAgentSessionsResponse.
        """
        ...
    # We situate the OpenAI Responses API in the Agents API, just as we did
    # for Inference. The Responses API, in its intent, serves the same purpose as
    # the Agents API above -- it is essentially a lightweight "agentic loop" with
    # integrated tool calling.
    #
    # Both of these APIs are inherently stateful.
    @webmethod(route="/openai/v1/responses/{id}", method="GET")
    async def get_openai_response(
        self,
        id: str,
    ) -> OpenAIResponseObject:
        """Retrieve an OpenAI response by its ID.
        :param id: The ID of the OpenAI response to retrieve.
        :returns: The OpenAIResponseObject identified by `id`.
        """
        ...
    @webmethod(route="/openai/v1/responses", method="POST")
    async def create_openai_response(
        self,
        input: Union[str, List[OpenAIResponseInputMessage]],
        model: str,
        previous_response_id: Optional[str] = None,
        store: Optional[bool] = True,
        stream: Optional[bool] = False,
        tools: Optional[List[OpenAIResponseInputTool]] = None,
    ) -> Union[OpenAIResponseObject, AsyncIterator[OpenAIResponseObjectStream]]:
        """Create a new OpenAI response.
        :param input: Input message(s) to create the response.
        :param model: The underlying LLM used for completions.
        :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
        :param store: (Optional) whether to persist the created response so that it can later be retrieved by its ID. Defaults to True.
        :param stream: (Optional) if True, the result is delivered as a stream of OpenAIResponseObjectStream events instead of a single response object. Defaults to False.
        :param tools: (Optional) tools made available to the model while it produces the response.
        :returns: An OpenAIResponseObject, or an async iterator of OpenAIResponseObjectStream events when streaming is requested.
        """
--- a/llama_stack/apis/openai_responses/openai_responses.py
+++ b/llama_stack/apis/openai_responses/openai_responses.py
@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import AsyncIterator, List, Literal, Optional, Protocol, Union, runtime_checkable
+from typing import List, Literal, Optional, Union
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
+from llama_stack.schema_utils import json_schema_type, register_schema
@json_schema_type
@ -104,7 +104,7 @@ class OpenAIResponseInputMessageContentText(BaseModel):
@json_schema_type
 class OpenAIResponseInputMessageContentImage(BaseModel):
-    detail: Literal["low", "high", "auto"] = "auto"
+    detail: Literal["low"] | Literal["high"] | Literal["auto"] = "auto"
    type: Literal["input_image"] = "input_image"
    # TODO: handle file_id
    image_url: Optional[str] = None
@ -121,13 +121,13 @@ register_schema(OpenAIResponseInputMessageContent, name="OpenAIResponseInputMess
@json_schema_type
 class OpenAIResponseInputMessage(BaseModel):
    content: Union[str, List[OpenAIResponseInputMessageContent]]
-    role: Literal["system", "developer", "user", "assistant"]
+    role: Literal["system"] | Literal["developer"] | Literal["user"] | Literal["assistant"]
    type: Optional[Literal["message"]] = "message"
@json_schema_type
 class OpenAIResponseInputToolWebSearch(BaseModel):
-    type: Literal["web_search", "web_search_preview_2025_03_11"] = "web_search"
+    type: Literal["web_search"] | Literal["web_search_preview_2025_03_11"] = "web_search"
    # TODO: actually use search_context_size somewhere...
    search_context_size: Optional[str] = Field(default="medium", pattern="^low|medium|high$")
    # TODO: add user_location
@ -138,27 +138,3 @@ OpenAIResponseInputTool = Annotated[
    Field(discriminator="type"),
 ]
 register_schema(OpenAIResponseInputTool, name="OpenAIResponseInputTool")
@runtime_checkable
 class OpenAIResponses(Protocol):
    """
    OpenAI Responses API implementation.
    """
    @webmethod(route="/openai/v1/responses/{id}", method="GET")
    async def get_openai_response(
        self,
        id: str,
    ) -> OpenAIResponseObject: ...
    @webmethod(route="/openai/v1/responses", method="POST")
    async def create_openai_response(
        self,
        input: Union[str, List[OpenAIResponseInputMessage]],
        model: str,
        previous_response_id: Optional[str] = None,
        store: Optional[bool] = True,
        stream: Optional[bool] = False,
        tools: Optional[List[OpenAIResponseInputTool]] = None,
    ) -> Union[OpenAIResponseObject, AsyncIterator[OpenAIResponseObjectStream]]: ...
--- a/llama_stack/apis/datatypes.py
+++ b/llama_stack/apis/datatypes.py
@ -24,7 +24,6 @@ class Api(Enum):
    eval = "eval"
    post_training = "post_training"
    tool_runtime = "tool_runtime"
    openai_responses = "openai_responses"
    telemetry = "telemetry"
--- a/llama_stack/apis/openai_responses/init.py
+++ b/llama_stack/apis/openai_responses/init.py
@ -1,7 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from .openai_responses import *  # noqa: F401 F403
--- a/llama_stack/distribution/resolver.py
+++ b/llama_stack/distribution/resolver.py
@ -16,7 +16,6 @@ from llama_stack.apis.files import Files
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.inspect import Inspect
 from llama_stack.apis.models import Models
 from llama_stack.apis.openai_responses.openai_responses import OpenAIResponses
 from llama_stack.apis.post_training import PostTraining
 from llama_stack.apis.providers import Providers as ProvidersAPI
 from llama_stack.apis.safety import Safety
@ -81,7 +80,6 @@ def api_protocol_map() -> Dict[Api, Any]:
        Api.tool_groups: ToolGroups,
        Api.tool_runtime: ToolRuntime,
        Api.files: Files,
        Api.openai_responses: OpenAIResponses,
    }
--- a/llama_stack/distribution/routers/routing_tables.py
+++ b/llama_stack/distribution/routers/routing_tables.py
@ -149,8 +149,6 @@ class CommonRoutingTableImpl(RoutingTable):
                p.benchmark_store = self
            elif api == Api.tool_runtime:
                p.tool_store = self
            elif api == Api.openai_responses:
                p.model_store = self
    async def shutdown(self) -> None:
        for p in self.impls_by_provider_id.values():
--- a/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agents.py
@ -23,6 +23,9 @@ from llama_stack.apis.agents import (
    Document,
    ListAgentSessionsResponse,
    ListAgentsResponse,
    OpenAIResponseInputMessage,
    OpenAIResponseInputTool,
    OpenAIResponseObject,
    Session,
    Turn,
 )
@ -40,6 +43,7 @@ from llama_stack.providers.utils.kvstore import InmemoryKVStoreImpl, kvstore_imp
 from .agent_instance import ChatAgent
 from .config import MetaReferenceAgentsImplConfig
 from .openai_responses import OpenAIResponsesImpl
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
@ -63,9 +67,16 @@ class MetaReferenceAgentsImpl(Agents):
        self.tool_groups_api = tool_groups_api
        self.in_memory_store = InmemoryKVStoreImpl()
        self.openai_responses_impl = None
    async def initialize(self) -> None:
        self.persistence_store = await kvstore_impl(self.config.persistence_store)
        self.openai_responses_impl = OpenAIResponsesImpl(
            self.persistence_store,
            inference_api=self.inference_api,
            tool_groups_api=self.tool_groups_api,
            tool_runtime_api=self.tool_runtime_api,
        )
        # check if "bwrap" is available
        if not shutil.which("bwrap"):
@ -244,3 +255,23 @@ class MetaReferenceAgentsImpl(Agents):
        agent_id: str,
    ) -> ListAgentSessionsResponse:
        pass
    # OpenAI responses
    async def get_openai_response(
        self,
        id: str,
    ) -> OpenAIResponseObject:
        """Fetch the OpenAI response with the given ID via the embedded responses implementation."""
        responses_impl = self.openai_responses_impl
        found = await responses_impl.get_openai_response(id)
        return found
    async def create_openai_response(
        self,
        input: Union[str, List[OpenAIResponseInputMessage]],
        model: str,
        previous_response_id: Optional[str] = None,
        store: Optional[bool] = True,
        stream: Optional[bool] = False,
        tools: Optional[List[OpenAIResponseInputTool]] = None,
    ) -> OpenAIResponseObject:
        """Create an OpenAI response by forwarding all arguments, unchanged and in
        order, to the embedded responses implementation.

        NOTE(review): the Agents protocol declares a Union with an async iterator
        of stream events as the return type; the narrower annotation here may not
        hold when ``stream`` is true -- confirm.
        """
        responses_impl = self.openai_responses_impl
        forwarded = (input, model, previous_response_id, store, stream, tools)
        created = await responses_impl.create_openai_response(*forwarded)
        return created
--- a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
+++ b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
@ -10,6 +10,20 @@ from typing import AsyncIterator, List, Optional, Union, cast
 from openai.types.chat import ChatCompletionToolParam
 from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseInputMessage,
    OpenAIResponseInputMessageContentImage,
    OpenAIResponseInputMessageContentText,
    OpenAIResponseInputTool,
    OpenAIResponseObject,
    OpenAIResponseObjectStream,
    OpenAIResponseObjectStreamResponseCompleted,
    OpenAIResponseObjectStreamResponseCreated,
    OpenAIResponseOutput,
    OpenAIResponseOutputMessage,
    OpenAIResponseOutputMessageContentOutputText,
    OpenAIResponseOutputMessageWebSearchToolCall,
 )
 from llama_stack.apis.inference.inference import (
    Inference,
    OpenAIAssistantMessageParam,
@ -24,29 +38,11 @@ from llama_stack.apis.inference.inference import (
    OpenAIToolMessageParam,
    OpenAIUserMessageParam,
 )
 from llama_stack.apis.models.models import Models, ModelType
 from llama_stack.apis.openai_responses import OpenAIResponses
 from llama_stack.apis.openai_responses.openai_responses import (
    OpenAIResponseInputMessage,
    OpenAIResponseInputMessageContentImage,
    OpenAIResponseInputMessageContentText,
    OpenAIResponseInputTool,
    OpenAIResponseObject,
    OpenAIResponseObjectStream,
    OpenAIResponseObjectStreamResponseCompleted,
    OpenAIResponseObjectStreamResponseCreated,
    OpenAIResponseOutput,
    OpenAIResponseOutputMessage,
    OpenAIResponseOutputMessageContentOutputText,
    OpenAIResponseOutputMessageWebSearchToolCall,
 )
 from llama_stack.apis.tools.tools import ToolGroups, ToolInvocationResult, ToolRuntime
 from llama_stack.log import get_logger
 from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
 from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool
-from llama_stack.providers.utils.kvstore import kvstore_impl
+from llama_stack.providers.utils.kvstore import KVStore
 from .config import OpenAIResponsesImplConfig
 logger = get_logger(name=__name__, category="openai_responses")
@ -80,34 +76,25 @@ async def _openai_choices_to_output_messages(choices: List[OpenAIChoice]) -> Lis
    return output_messages
-class OpenAIResponsesImpl(OpenAIResponses):
+class OpenAIResponsesImpl:
    def __init__(
        self,
-        config: OpenAIResponsesImplConfig,
+        persistence_store: KVStore,
        models_api: Models,
        inference_api: Inference,
        tool_groups_api: ToolGroups,
        tool_runtime_api: ToolRuntime,
    ):
-        self.config = config
+        self.persistence_store = persistence_store
        self.models_api = models_api
        self.inference_api = inference_api
        self.tool_groups_api = tool_groups_api
        self.tool_runtime_api = tool_runtime_api
    async def initialize(self) -> None:
        self.kvstore = await kvstore_impl(self.config.kvstore)
    async def shutdown(self) -> None:
        logger.debug("OpenAIResponsesImpl.shutdown")
        pass
    async def get_openai_response(
        self,
        id: str,
    ) -> OpenAIResponseObject:
        key = f"{OPENAI_RESPONSES_PREFIX}{id}"
-        response_json = await self.kvstore.get(key=key)
+        response_json = await self.persistence_store.get(key=key)
        if response_json is None:
            raise ValueError(f"OpenAI response with id '{id}' not found")
        return OpenAIResponseObject.model_validate_json(response_json)
@ -122,11 +109,6 @@ class OpenAIResponsesImpl(OpenAIResponses):
        tools: Optional[List[OpenAIResponseInputTool]] = None,
    ):
        stream = False if stream is None else stream
        model_obj = await self.models_api.get_model(model)
        if model_obj is None:
            raise ValueError(f"Model '{model}' not found")
        if model_obj.model_type == ModelType.embedding:
            raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions")
        messages: List[OpenAIMessageParam] = []
        if previous_response_id:
@ -155,7 +137,7 @@ class OpenAIResponsesImpl(OpenAIResponses):
        chat_tools = await self._convert_response_tools_to_chat_tools(tools) if tools else None
        chat_response = await self.inference_api.openai_chat_completion(
-            model=model_obj.identifier,
+            model=model,
            messages=messages,
            tools=chat_tools,
            stream=stream,
@ -198,14 +180,14 @@ class OpenAIResponsesImpl(OpenAIResponses):
        output_messages: List[OpenAIResponseOutput] = []
        if chat_response.choices[0].finish_reason == "tool_calls":
            output_messages.extend(
-                await self._execute_tool_and_return_final_output(model_obj.identifier, stream, chat_response, messages)
+                await self._execute_tool_and_return_final_output(model, stream, chat_response, messages)
            )
        else:
            output_messages.extend(await _openai_choices_to_output_messages(chat_response.choices))
        response = OpenAIResponseObject(
            created_at=chat_response.created,
            id=f"resp-{uuid.uuid4()}",
-            model=model_obj.identifier,
+            model=model,
            object="response",
            status="completed",
            output=output_messages,
@ -214,7 +196,7 @@ class OpenAIResponsesImpl(OpenAIResponses):
        if store:
            # Store in kvstore
            key = f"{OPENAI_RESPONSES_PREFIX}{response.id}"
-            await self.kvstore.set(
+            await self.persistence_store.set(
                key=key,
                value=response.model_dump_json(),
            )
--- a/llama_stack/providers/inline/openai_responses/init.py
+++ b/llama_stack/providers/inline/openai_responses/init.py
@ -1,21 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from typing import Any, Dict
 from llama_stack.apis.datatypes import Api
 from .config import OpenAIResponsesImplConfig
 async def get_provider_impl(config: OpenAIResponsesImplConfig, deps: Dict[Api, Any]):
    from .openai_responses import OpenAIResponsesImpl
    impl = OpenAIResponsesImpl(
        config, deps[Api.models], deps[Api.inference], deps[Api.tool_groups], deps[Api.tool_runtime]
    )
    await impl.initialize()
    return impl
--- a/llama_stack/providers/inline/openai_responses/config.py
+++ b/llama_stack/providers/inline/openai_responses/config.py
@ -1,24 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from typing import Any, Dict
 from pydantic import BaseModel
 from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
 class OpenAIResponsesImplConfig(BaseModel):
    kvstore: KVStoreConfig
    @classmethod
    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
        return {
            "kvstore": SqliteKVStoreConfig.sample_run_config(
                __distro_dir__=__distro_dir__,
                db_name="openai_responses.db",
            )
        }
--- a/llama_stack/providers/registry/openai_responses.py
+++ b/llama_stack/providers/registry/openai_responses.py
@ -1,27 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from typing import List
 from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
 def available_providers() -> List[ProviderSpec]:
    return [
        InlineProviderSpec(
            api=Api.openai_responses,
            provider_type="inline::openai-responses",
            pip_packages=[],
            module="llama_stack.providers.inline.openai_responses",
            config_class="llama_stack.providers.inline.openai_responses.config.OpenAIResponsesImplConfig",
            api_dependencies=[
                Api.models,
                Api.inference,
                Api.tool_groups,
                Api.tool_runtime,
            ],
        ),
    ]
--- a/llama_stack/strong_typing/schema.py
+++ b/llama_stack/strong_typing/schema.py
@ -478,6 +478,8 @@ class JsonSchemaGenerator:
                }
            return ret
        elif origin_type is Literal:
            if len(typing.get_args(typ)) != 1:
                print(f"Literal type {typ} has {len(typing.get_args(typ))} arguments")
            (literal_value,) = typing.get_args(typ)  # unpack value of literal type
            schema = self.type_to_schema(type(literal_value))
            schema["const"] = literal_value
--- a/llama_stack/templates/remote-vllm/build.yaml
+++ b/llama_stack/templates/remote-vllm/build.yaml
@ -24,8 +24,6 @@ distribution_spec:
    - inline::braintrust
    telemetry:
    - inline::meta-reference
    openai_responses:
    - inline::openai-responses
    tool_runtime:
    - remote::brave-search
    - remote::tavily-search
--- a/llama_stack/templates/remote-vllm/run-with-safety.yaml
+++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml
@ -5,7 +5,6 @@ apis:
 - datasetio
 - eval
 - inference
 - openai_responses
 - safety
 - scoring
 - telemetry
@ -92,14 +91,6 @@ providers:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/remote-vllm/trace_store.db}
  openai_responses:
  - provider_id: openai-responses
    provider_type: inline::openai-responses
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/openai_responses.db
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
--- a/llama_stack/templates/remote-vllm/run.yaml
+++ b/llama_stack/templates/remote-vllm/run.yaml
@ -5,7 +5,6 @@ apis:
 - datasetio
 - eval
 - inference
 - openai_responses
 - safety
 - scoring
 - telemetry
@ -85,14 +84,6 @@ providers:
      service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/remote-vllm/trace_store.db}
  openai_responses:
  - provider_id: openai-responses
    provider_type: inline::openai-responses
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/openai_responses.db
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
--- a/llama_stack/templates/remote-vllm/vllm.py
+++ b/llama_stack/templates/remote-vllm/vllm.py
@ -31,7 +31,6 @@ def get_distribution_template() -> DistributionTemplate:
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
        "telemetry": ["inline::meta-reference"],
        "openai_responses": ["inline::openai-responses"],
        "tool_runtime": [
            "remote::brave-search",
            "remote::tavily-search",
--- a/llama_stack/templates/together/build.yaml
+++ b/llama_stack/templates/together/build.yaml
@ -24,8 +24,6 @@ distribution_spec:
    - inline::basic
    - inline::llm-as-judge
    - inline::braintrust
    openai_responses:
    - inline::openai-responses
    tool_runtime:
    - remote::brave-search
    - remote::tavily-search
--- a/llama_stack/templates/together/run-with-safety.yaml
+++ b/llama_stack/templates/together/run-with-safety.yaml
@ -5,7 +5,6 @@ apis:
 - datasetio
 - eval
 - inference
 - openai_responses
 - safety
 - scoring
 - telemetry
@ -88,14 +87,6 @@ providers:
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:}
  openai_responses:
  - provider_id: openai-responses
    provider_type: inline::openai-responses
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/openai_responses.db
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
--- a/llama_stack/templates/together/run.yaml
+++ b/llama_stack/templates/together/run.yaml
@ -5,7 +5,6 @@ apis:
 - datasetio
 - eval
 - inference
 - openai_responses
 - safety
 - scoring
 - telemetry
@ -83,14 +82,6 @@ providers:
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:}
  openai_responses:
  - provider_id: openai-responses
    provider_type: inline::openai-responses
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/openai_responses.db
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
--- a/llama_stack/templates/together/together.py
+++ b/llama_stack/templates/together/together.py
@ -36,7 +36,6 @@ def get_distribution_template() -> DistributionTemplate:
        "eval": ["inline::meta-reference"],
        "datasetio": ["remote::huggingface", "inline::localfs"],
        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
        "openai_responses": ["inline::openai-responses"],
        "tool_runtime": [
            "remote::brave-search",
            "remote::tavily-search",