From abd6280cb8772545193560772a0d94f7323ee629 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Mon, 28 Apr 2025 10:27:28 -0700 Subject: [PATCH] fold openai responses into the Agents API --- docs/_static/llama-stack-spec.html | 512 ++++++++++++++++++ docs/_static/llama-stack-spec.yaml | 350 ++++++++++++ docs/openapi_generator/pyopenapi/generator.py | 2 +- .../self_hosted_distro/remote-vllm.md | 1 - .../self_hosted_distro/together.md | 1 - llama_stack/apis/agents/agents.py | 43 ++ .../openai_responses.py | 34 +- llama_stack/apis/datatypes.py | 1 - llama_stack/apis/openai_responses/__init__.py | 7 - llama_stack/distribution/resolver.py | 2 - .../distribution/routers/routing_tables.py | 2 - .../inline/agents/meta_reference/agents.py | 31 ++ .../meta_reference}/openai_responses.py | 64 +-- .../inline/openai_responses/__init__.py | 21 - .../inline/openai_responses/config.py | 24 - .../providers/registry/openai_responses.py | 27 - llama_stack/strong_typing/schema.py | 2 + llama_stack/templates/remote-vllm/build.yaml | 2 - .../remote-vllm/run-with-safety.yaml | 9 - llama_stack/templates/remote-vllm/run.yaml | 9 - llama_stack/templates/remote-vllm/vllm.py | 1 - llama_stack/templates/together/build.yaml | 2 - .../templates/together/run-with-safety.yaml | 9 - llama_stack/templates/together/run.yaml | 9 - llama_stack/templates/together/together.py | 1 - 25 files changed, 967 insertions(+), 199 deletions(-) rename llama_stack/apis/{openai_responses => agents}/openai_responses.py (78%) delete mode 100644 llama_stack/apis/openai_responses/__init__.py rename llama_stack/providers/inline/{openai_responses => agents/meta_reference}/openai_responses.py (91%) delete mode 100644 llama_stack/providers/inline/openai_responses/__init__.py delete mode 100644 llama_stack/providers/inline/openai_responses/config.py delete mode 100644 llama_stack/providers/registry/openai_responses.py diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 4c5393947..49c402d37 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -497,6 +497,54 @@ } } }, + "/v1/openai/v1/responses": { + "post": { + "responses": { + "200": { + "description": "Runtime representation of an annotated type.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenAIResponseObject" + } + }, + "text/event-stream": { + "schema": { + "$ref": "#/components/schemas/OpenAIResponseObjectStream" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Agents" + ], + "description": "Create a new OpenAI response.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateOpenaiResponseRequest" + } + } + }, + "required": true + } + } + }, "/v1/files": { "get": { "responses": { @@ -1278,6 +1326,49 @@ ] } }, + "/v1/openai/v1/responses/{id}": { + "get": { + "responses": { + "200": { + "description": "An OpenAIResponseObject.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenAIResponseObject" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": 
"#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Agents" + ], + "description": "Retrieve an OpenAI response by its ID.", + "parameters": [ + { + "name": "id", + "in": "path", + "description": "The ID of the OpenAI response to retrieve.", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, "/v1/scoring-functions/{scoring_fn_id}": { "get": { "responses": { @@ -6192,6 +6283,427 @@ ], "title": "AgentTurnResponseTurnStartPayload" }, + "OpenAIResponseInputMessage": { + "type": "object", + "properties": { + "content": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContent" + } + } + ] + }, + "role": { + "oneOf": [ + { + "type": "string", + "const": "system" + }, + { + "type": "string", + "const": "developer" + }, + { + "type": "string", + "const": "user" + }, + { + "type": "string", + "const": "assistant" + } + ] + }, + "type": { + "type": "string", + "const": "message", + "default": "message" + } + }, + "additionalProperties": false, + "required": [ + "content", + "role" + ], + "title": "OpenAIResponseInputMessage" + }, + "OpenAIResponseInputMessageContent": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContentText" + }, + { + "$ref": "#/components/schemas/OpenAIResponseInputMessageContentImage" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "input_text": "#/components/schemas/OpenAIResponseInputMessageContentText", + "input_image": "#/components/schemas/OpenAIResponseInputMessageContentImage" + } + } + }, + "OpenAIResponseInputMessageContentImage": { + "type": "object", + "properties": { + "detail": { + "oneOf": [ + { + "type": "string", + "const": "low" + }, + { + "type": "string", + "const": "high" + }, + { + "type": "string", + "const": "auto" + } + ], + "default": "auto" + }, + "type": { + "type": "string", + "const": "input_image", + "default": "input_image" + }, + "image_url": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "detail", + "type" + ], + "title": "OpenAIResponseInputMessageContentImage" + }, + "OpenAIResponseInputMessageContentText": { + "type": "object", + "properties": { + "text": { + "type": "string" + }, + "type": { + "type": "string", + "const": "input_text", + "default": "input_text" + } + }, + "additionalProperties": false, + "required": [ + "text", + "type" + ], + "title": "OpenAIResponseInputMessageContentText" + }, + "OpenAIResponseInputTool": { + "type": "object", + "properties": { + "type": { + "oneOf": [ + { + "type": "string", + "const": "web_search" + }, + { + "type": "string", + "const": "web_search_preview_2025_03_11" + } + ], + "default": "web_search" + }, + "search_context_size": { + "type": "string", + "default": "medium" + } + }, + "additionalProperties": false, + "required": [ + "type" + ], + "title": "OpenAIResponseInputToolWebSearch" + }, + "CreateOpenaiResponseRequest": { + "type": "object", + "properties": { + "input": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseInputMessage" + } + } + ], + "description": "Input message(s) to create the response." + }, + "model": { + "type": "string", + "description": "The underlying LLM used for completions." 
+ }, + "previous_response_id": { + "type": "string", + "description": "(Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses." + }, + "store": { + "type": "boolean" + }, + "stream": { + "type": "boolean" + }, + "tools": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseInputTool" + } + } + }, + "additionalProperties": false, + "required": [ + "input", + "model" + ], + "title": "CreateOpenaiResponseRequest" + }, + "OpenAIResponseError": { + "type": "object", + "properties": { + "code": { + "type": "string" + }, + "message": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "code", + "message" + ], + "title": "OpenAIResponseError" + }, + "OpenAIResponseObject": { + "type": "object", + "properties": { + "created_at": { + "type": "integer" + }, + "error": { + "$ref": "#/components/schemas/OpenAIResponseError" + }, + "id": { + "type": "string" + }, + "model": { + "type": "string" + }, + "object": { + "type": "string", + "const": "response", + "default": "response" + }, + "output": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseOutput" + } + }, + "parallel_tool_calls": { + "type": "boolean", + "default": false + }, + "previous_response_id": { + "type": "string" + }, + "status": { + "type": "string" + }, + "temperature": { + "type": "number" + }, + "top_p": { + "type": "number" + }, + "truncation": { + "type": "string" + }, + "user": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "created_at", + "id", + "model", + "object", + "output", + "parallel_tool_calls", + "status" + ], + "title": "OpenAIResponseObject" + }, + "OpenAIResponseOutput": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessage" + }, + { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "message": "#/components/schemas/OpenAIResponseOutputMessage", + "web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" + } + } + }, + "OpenAIResponseOutputMessage": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "content": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIResponseOutputMessageContent" + } + }, + "role": { + "type": "string", + "const": "assistant", + "default": "assistant" + }, + "status": { + "type": "string" + }, + "type": { + "type": "string", + "const": "message", + "default": "message" + } + }, + "additionalProperties": false, + "required": [ + "id", + "content", + "role", + "status", + "type" + ], + "title": "OpenAIResponseOutputMessage" + }, + "OpenAIResponseOutputMessageContent": { + "type": "object", + "properties": { + "text": { + "type": "string" + }, + "type": { + "type": "string", + "const": "output_text", + "default": "output_text" + } + }, + "additionalProperties": false, + "required": [ + "text", + "type" + ], + "title": "OpenAIResponseOutputMessageContentOutputText" + }, + "OpenAIResponseOutputMessageWebSearchToolCall": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "status": { + "type": "string" + }, + "type": { + "type": "string", + "const": "web_search_call", + "default": "web_search_call" + } + }, + "additionalProperties": false, + "required": [ + "id", + "status", + "type" + ], + "title": "OpenAIResponseOutputMessageWebSearchToolCall" + }, + 
"OpenAIResponseObjectStream": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCreated" + }, + { + "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted" + } + ], + "discriminator": { + "propertyName": "type", + "mapping": { + "response.created": "#/components/schemas/OpenAIResponseObjectStreamResponseCreated", + "response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted" + } + } + }, + "OpenAIResponseObjectStreamResponseCompleted": { + "type": "object", + "properties": { + "response": { + "$ref": "#/components/schemas/OpenAIResponseObject" + }, + "type": { + "type": "string", + "const": "response.completed", + "default": "response.completed" + } + }, + "additionalProperties": false, + "required": [ + "response", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseCompleted" + }, + "OpenAIResponseObjectStreamResponseCreated": { + "type": "object", + "properties": { + "response": { + "$ref": "#/components/schemas/OpenAIResponseObject" + }, + "type": { + "type": "string", + "const": "response.created", + "default": "response.created" + } + }, + "additionalProperties": false, + "required": [ + "response", + "type" + ], + "title": "OpenAIResponseObjectStreamResponseCreated" + }, "CreateUploadSessionRequest": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index a24f1a9db..e5bfad623 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -330,6 +330,39 @@ paths: schema: $ref: '#/components/schemas/CreateAgentTurnRequest' required: true + /v1/openai/v1/responses: + post: + responses: + '200': + description: >- + Runtime representation of an annotated type. + content: + application/json: + schema: + $ref: '#/components/schemas/OpenAIResponseObject' + text/event-stream: + schema: + $ref: '#/components/schemas/OpenAIResponseObjectStream' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Agents + description: Create a new OpenAI response. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/CreateOpenaiResponseRequest' + required: true /v1/files: get: responses: @@ -875,6 +908,36 @@ paths: required: true schema: type: string + /v1/openai/v1/responses/{id}: + get: + responses: + '200': + description: An OpenAIResponseObject. + content: + application/json: + schema: + $ref: '#/components/schemas/OpenAIResponseObject' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Agents + description: Retrieve an OpenAI response by its ID. + parameters: + - name: id + in: path + description: >- + The ID of the OpenAI response to retrieve. 
+ required: true + schema: + type: string /v1/scoring-functions/{scoring_fn_id}: get: responses: @@ -4329,6 +4392,293 @@ components: - event_type - turn_id title: AgentTurnResponseTurnStartPayload + OpenAIResponseInputMessage: + type: object + properties: + content: + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIResponseInputMessageContent' + role: + oneOf: + - type: string + const: system + - type: string + const: developer + - type: string + const: user + - type: string + const: assistant + type: + type: string + const: message + default: message + additionalProperties: false + required: + - content + - role + title: OpenAIResponseInputMessage + OpenAIResponseInputMessageContent: + oneOf: + - $ref: '#/components/schemas/OpenAIResponseInputMessageContentText' + - $ref: '#/components/schemas/OpenAIResponseInputMessageContentImage' + discriminator: + propertyName: type + mapping: + input_text: '#/components/schemas/OpenAIResponseInputMessageContentText' + input_image: '#/components/schemas/OpenAIResponseInputMessageContentImage' + OpenAIResponseInputMessageContentImage: + type: object + properties: + detail: + oneOf: + - type: string + const: low + - type: string + const: high + - type: string + const: auto + default: auto + type: + type: string + const: input_image + default: input_image + image_url: + type: string + additionalProperties: false + required: + - detail + - type + title: OpenAIResponseInputMessageContentImage + OpenAIResponseInputMessageContentText: + type: object + properties: + text: + type: string + type: + type: string + const: input_text + default: input_text + additionalProperties: false + required: + - text + - type + title: OpenAIResponseInputMessageContentText + OpenAIResponseInputTool: + type: object + properties: + type: + oneOf: + - type: string + const: web_search + - type: string + const: web_search_preview_2025_03_11 + default: web_search + search_context_size: + type: string + default: medium + additionalProperties: false + required: + - type + title: OpenAIResponseInputToolWebSearch + CreateOpenaiResponseRequest: + type: object + properties: + input: + oneOf: + - type: string + - type: array + items: + $ref: '#/components/schemas/OpenAIResponseInputMessage' + description: Input message(s) to create the response. + model: + type: string + description: The underlying LLM used for completions. + previous_response_id: + type: string + description: >- + (Optional) if specified, the new response will be a continuation of the + previous response. This can be used to easily fork-off new responses from + existing responses. 
+ store: + type: boolean + stream: + type: boolean + tools: + type: array + items: + $ref: '#/components/schemas/OpenAIResponseInputTool' + additionalProperties: false + required: + - input + - model + title: CreateOpenaiResponseRequest + OpenAIResponseError: + type: object + properties: + code: + type: string + message: + type: string + additionalProperties: false + required: + - code + - message + title: OpenAIResponseError + OpenAIResponseObject: + type: object + properties: + created_at: + type: integer + error: + $ref: '#/components/schemas/OpenAIResponseError' + id: + type: string + model: + type: string + object: + type: string + const: response + default: response + output: + type: array + items: + $ref: '#/components/schemas/OpenAIResponseOutput' + parallel_tool_calls: + type: boolean + default: false + previous_response_id: + type: string + status: + type: string + temperature: + type: number + top_p: + type: number + truncation: + type: string + user: + type: string + additionalProperties: false + required: + - created_at + - id + - model + - object + - output + - parallel_tool_calls + - status + title: OpenAIResponseObject + OpenAIResponseOutput: + oneOf: + - $ref: '#/components/schemas/OpenAIResponseOutputMessage' + - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' + discriminator: + propertyName: type + mapping: + message: '#/components/schemas/OpenAIResponseOutputMessage' + web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' + OpenAIResponseOutputMessage: + type: object + properties: + id: + type: string + content: + type: array + items: + $ref: '#/components/schemas/OpenAIResponseOutputMessageContent' + role: + type: string + const: assistant + default: assistant + status: + type: string + type: + type: string + const: message + default: message + additionalProperties: false + required: + - id + - content + - role + - status + - type + title: OpenAIResponseOutputMessage + OpenAIResponseOutputMessageContent: + type: object + properties: + text: + type: string + type: + type: string + const: output_text + default: output_text + additionalProperties: false + required: + - text + - type + title: >- + OpenAIResponseOutputMessageContentOutputText + "OpenAIResponseOutputMessageWebSearchToolCall": + type: object + properties: + id: + type: string + status: + type: string + type: + type: string + const: web_search_call + default: web_search_call + additionalProperties: false + required: + - id + - status + - type + title: >- + OpenAIResponseOutputMessageWebSearchToolCall + OpenAIResponseObjectStream: + oneOf: + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated' + - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted' + discriminator: + propertyName: type + mapping: + response.created: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated' + response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted' + "OpenAIResponseObjectStreamResponseCompleted": + type: object + properties: + response: + $ref: '#/components/schemas/OpenAIResponseObject' + type: + type: string + const: response.completed + default: response.completed + additionalProperties: false + required: + - response + - type + title: >- + OpenAIResponseObjectStreamResponseCompleted + "OpenAIResponseObjectStreamResponseCreated": + type: object + properties: + response: + $ref: '#/components/schemas/OpenAIResponseObject' + type: + type: string + const: response.created + default: 
response.created
+      additionalProperties: false
+      required:
+        - response
+        - type
+      title: >-
+        OpenAIResponseObjectStreamResponseCreated
     CreateUploadSessionRequest:
       type: object
       properties:
diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py
index 3936bb3c4..6d5e48a46 100644
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@@ -179,7 +179,7 @@ class ContentBuilder:
         "Creates the content subtree for a request or response."
 
         def is_iterator_type(t):
-            return "StreamChunk" in str(t)
+            return "StreamChunk" in str(t) or "OpenAIResponseObjectStream" in str(t)
 
         def get_media_type(t):
             if is_generic_list(t):
diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md
index 74365722d..46df56008 100644
--- a/docs/source/distributions/self_hosted_distro/remote-vllm.md
+++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md
@@ -18,7 +18,6 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::vllm`, `inline::sentence-transformers` |
-| openai_responses | `inline::openai-responses` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md
index 5da0ee980..3ebb1f59e 100644
--- a/docs/source/distributions/self_hosted_distro/together.md
+++ b/docs/source/distributions/self_hosted_distro/together.md
@@ -19,7 +19,6 @@ The `llamastack/distribution-together` distribution consists of the following pr
 | datasetio | `remote::huggingface`, `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::together`, `inline::sentence-transformers` |
-| openai_responses | `inline::openai-responses` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py
index dec43280b..4db6e2226 100644
--- a/llama_stack/apis/agents/agents.py
+++ b/llama_stack/apis/agents/agents.py
@@ -38,6 +38,13 @@ from llama_stack.apis.safety import SafetyViolation
 from llama_stack.apis.tools import ToolDef
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 
+from .openai_responses import (
+    OpenAIResponseInputMessage,
+    OpenAIResponseInputTool,
+    OpenAIResponseObject,
+    OpenAIResponseObjectStream,
+)
+
 
 class Attachment(BaseModel):
     """An attachment to an agent turn.
@@ -593,3 +600,53 @@ class Agents(Protocol):
         :returns: A ListAgentSessionsResponse.
         """
         ...
+
+    # We situate the OpenAI Responses API in the Agents API just as we did
+    # for Inference. In intent, the Responses API serves the same purpose as
+    # the Agents API above -- it is essentially a lightweight "agentic loop" with
+    # integrated tool calling.
+    #
+    # Both of these APIs are inherently stateful.
+
+    @webmethod(route="/openai/v1/responses/{id}", method="GET")
+    async def get_openai_response(
+        self,
+        id: str,
+    ) -> OpenAIResponseObject:
+        """Retrieve an OpenAI response by its ID.
+
+        :param id: The ID of the OpenAI response to retrieve.
+        :returns: An OpenAIResponseObject.
+        """
+        ...
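+
+    # Illustrative usage sketch (hypothetical `agents` handle bound to an Agents
+    # implementation; model id and prompt are placeholders):
+    #
+    #   response = await agents.create_openai_response(
+    #       input="What is the capital of France?",
+    #       model="meta-llama/Llama-3.3-70B-Instruct",
+    #   )
+    #   fetched = await agents.get_openai_response(response.id)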
+
+    @webmethod(route="/openai/v1/responses", method="POST")
+    async def create_openai_response(
+        self,
+        input: Union[str, List[OpenAIResponseInputMessage]],
+        model: str,
+        previous_response_id: Optional[str] = None,
+        store: Optional[bool] = True,
+        stream: Optional[bool] = False,
+        tools: Optional[List[OpenAIResponseInputTool]] = None,
+    ) -> Union[OpenAIResponseObject, AsyncIterator[OpenAIResponseObjectStream]]:
+        """Create a new OpenAI response.
+
+        :param input: Input message(s) to create the response.
+        :param model: The underlying LLM used for completions.
+        :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork off new responses from existing responses.
+        :param store: (Optional) Whether to persist the generated response so it can be retrieved later. Defaults to True.
+        :param stream: (Optional) Whether to stream the response as server-sent events. Defaults to False.
+        :param tools: (Optional) List of tools the model may invoke while generating the response.
+        :returns: An OpenAIResponseObject, or an AsyncIterator of OpenAIResponseObjectStream chunks when streaming.
+        """
+        ...
diff --git a/llama_stack/apis/openai_responses/openai_responses.py b/llama_stack/apis/agents/openai_responses.py
similarity index 78%
rename from llama_stack/apis/openai_responses/openai_responses.py
rename to llama_stack/apis/agents/openai_responses.py
index 0b21f3f28..72f16e224 100644
--- a/llama_stack/apis/openai_responses/openai_responses.py
+++ b/llama_stack/apis/agents/openai_responses.py
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import AsyncIterator, List, Literal, Optional, Protocol, Union, runtime_checkable
+from typing import List, Literal, Optional, Union
 
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated
 
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
+from llama_stack.schema_utils import json_schema_type, register_schema
 
 
 @json_schema_type
@@ -104,7 +104,7 @@ class OpenAIResponseInputMessageContentText(BaseModel):
 
 @json_schema_type
 class OpenAIResponseInputMessageContentImage(BaseModel):
-    detail: Literal["low", "high", "auto"] = "auto"
+    detail: Literal["low"] | Literal["high"] | Literal["auto"] = "auto"
     type: Literal["input_image"] = "input_image"
     # TODO: handle file_id
     image_url: Optional[str] = None
@@ -121,13 +121,13 @@ register_schema(OpenAIResponseInputMessageContent, name="OpenAIResponseInputMess
 @json_schema_type
 class OpenAIResponseInputMessage(BaseModel):
     content: Union[str, List[OpenAIResponseInputMessageContent]]
-    role: Literal["system", "developer", "user", "assistant"]
+    role: Literal["system"] | Literal["developer"] | Literal["user"] | Literal["assistant"]
     type: Optional[Literal["message"]] = "message"
 
 
 @json_schema_type
 class OpenAIResponseInputToolWebSearch(BaseModel):
-    type: Literal["web_search", "web_search_preview_2025_03_11"] = "web_search"
+    type: Literal["web_search"] | Literal["web_search_preview_2025_03_11"] = "web_search"
     # TODO: actually use search_context_size somewhere...
     search_context_size: Optional[str] = Field(default="medium", pattern="^low|medium|high$")
     # TODO: add user_location
@@ -138,27 +138,3 @@ OpenAIResponseInputTool = Annotated[
     Field(discriminator="type"),
 ]
 register_schema(OpenAIResponseInputTool, name="OpenAIResponseInputTool")
-
-
-@runtime_checkable
-class OpenAIResponses(Protocol):
-    """
-    OpenAI Responses API implementation.
-    """
-
-    @webmethod(route="/openai/v1/responses/{id}", method="GET")
-    async def get_openai_response(
-        self,
-        id: str,
-    ) -> OpenAIResponseObject: ...
- - @webmethod(route="/openai/v1/responses", method="POST") - async def create_openai_response( - self, - input: Union[str, List[OpenAIResponseInputMessage]], - model: str, - previous_response_id: Optional[str] = None, - store: Optional[bool] = True, - stream: Optional[bool] = False, - tools: Optional[List[OpenAIResponseInputTool]] = None, - ) -> Union[OpenAIResponseObject, AsyncIterator[OpenAIResponseObjectStream]]: ... diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py index 85c0ecc6b..25f3ab1ab 100644 --- a/llama_stack/apis/datatypes.py +++ b/llama_stack/apis/datatypes.py @@ -24,7 +24,6 @@ class Api(Enum): eval = "eval" post_training = "post_training" tool_runtime = "tool_runtime" - openai_responses = "openai_responses" telemetry = "telemetry" diff --git a/llama_stack/apis/openai_responses/__init__.py b/llama_stack/apis/openai_responses/__init__.py deleted file mode 100644 index a3b32ff71..000000000 --- a/llama_stack/apis/openai_responses/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .openai_responses import * # noqa: F401 F403 diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index 25c91fca1..e9a594eba 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -16,7 +16,6 @@ from llama_stack.apis.files import Files from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models -from llama_stack.apis.openai_responses.openai_responses import OpenAIResponses from llama_stack.apis.post_training import PostTraining from llama_stack.apis.providers import Providers as ProvidersAPI from llama_stack.apis.safety import Safety @@ -81,7 +80,6 @@ def api_protocol_map() -> Dict[Api, Any]: Api.tool_groups: ToolGroups, Api.tool_runtime: ToolRuntime, Api.files: Files, - Api.openai_responses: OpenAIResponses, } diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 50416f338..18b0c891f 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -149,8 +149,6 @@ class CommonRoutingTableImpl(RoutingTable): p.benchmark_store = self elif api == Api.tool_runtime: p.tool_store = self - elif api == Api.openai_responses: - p.model_store = self async def shutdown(self) -> None: for p in self.impls_by_provider_id.values(): diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py index 656178773..38aa6fd97 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agents.py +++ b/llama_stack/providers/inline/agents/meta_reference/agents.py @@ -23,6 +23,9 @@ from llama_stack.apis.agents import ( Document, ListAgentSessionsResponse, ListAgentsResponse, + OpenAIResponseInputMessage, + OpenAIResponseInputTool, + OpenAIResponseObject, Session, Turn, ) @@ -40,6 +43,7 @@ from llama_stack.providers.utils.kvstore import InmemoryKVStoreImpl, kvstore_imp from .agent_instance import ChatAgent from .config import MetaReferenceAgentsImplConfig +from .openai_responses import OpenAIResponsesImpl logger = logging.getLogger() logger.setLevel(logging.INFO) @@ -63,9 +67,16 @@ class MetaReferenceAgentsImpl(Agents): self.tool_groups_api = 
tool_groups_api self.in_memory_store = InmemoryKVStoreImpl() + self.openai_responses_impl = None async def initialize(self) -> None: self.persistence_store = await kvstore_impl(self.config.persistence_store) + self.openai_responses_impl = OpenAIResponsesImpl( + self.persistence_store, + inference_api=self.inference_api, + tool_groups_api=self.tool_groups_api, + tool_runtime_api=self.tool_runtime_api, + ) # check if "bwrap" is available if not shutil.which("bwrap"): @@ -244,3 +255,23 @@ class MetaReferenceAgentsImpl(Agents): agent_id: str, ) -> ListAgentSessionsResponse: pass + + # OpenAI responses + async def get_openai_response( + self, + id: str, + ) -> OpenAIResponseObject: + return await self.openai_responses_impl.get_openai_response(id) + + async def create_openai_response( + self, + input: Union[str, List[OpenAIResponseInputMessage]], + model: str, + previous_response_id: Optional[str] = None, + store: Optional[bool] = True, + stream: Optional[bool] = False, + tools: Optional[List[OpenAIResponseInputTool]] = None, + ) -> OpenAIResponseObject: + return await self.openai_responses_impl.create_openai_response( + input, model, previous_response_id, store, stream, tools + ) diff --git a/llama_stack/providers/inline/openai_responses/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py similarity index 91% rename from llama_stack/providers/inline/openai_responses/openai_responses.py rename to llama_stack/providers/inline/agents/meta_reference/openai_responses.py index c7d767f73..db1e32f8b 100644 --- a/llama_stack/providers/inline/openai_responses/openai_responses.py +++ b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py @@ -10,6 +10,20 @@ from typing import AsyncIterator, List, Optional, Union, cast from openai.types.chat import ChatCompletionToolParam +from llama_stack.apis.agents.openai_responses import ( + OpenAIResponseInputMessage, + OpenAIResponseInputMessageContentImage, + OpenAIResponseInputMessageContentText, + OpenAIResponseInputTool, + OpenAIResponseObject, + OpenAIResponseObjectStream, + OpenAIResponseObjectStreamResponseCompleted, + OpenAIResponseObjectStreamResponseCreated, + OpenAIResponseOutput, + OpenAIResponseOutputMessage, + OpenAIResponseOutputMessageContentOutputText, + OpenAIResponseOutputMessageWebSearchToolCall, +) from llama_stack.apis.inference.inference import ( Inference, OpenAIAssistantMessageParam, @@ -24,29 +38,11 @@ from llama_stack.apis.inference.inference import ( OpenAIToolMessageParam, OpenAIUserMessageParam, ) -from llama_stack.apis.models.models import Models, ModelType -from llama_stack.apis.openai_responses import OpenAIResponses -from llama_stack.apis.openai_responses.openai_responses import ( - OpenAIResponseInputMessage, - OpenAIResponseInputMessageContentImage, - OpenAIResponseInputMessageContentText, - OpenAIResponseInputTool, - OpenAIResponseObject, - OpenAIResponseObjectStream, - OpenAIResponseObjectStreamResponseCompleted, - OpenAIResponseObjectStreamResponseCreated, - OpenAIResponseOutput, - OpenAIResponseOutputMessage, - OpenAIResponseOutputMessageContentOutputText, - OpenAIResponseOutputMessageWebSearchToolCall, -) from llama_stack.apis.tools.tools import ToolGroups, ToolInvocationResult, ToolRuntime from llama_stack.log import get_logger from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool -from llama_stack.providers.utils.kvstore import kvstore_impl - 
-from .config import OpenAIResponsesImplConfig +from llama_stack.providers.utils.kvstore import KVStore logger = get_logger(name=__name__, category="openai_responses") @@ -80,34 +76,25 @@ async def _openai_choices_to_output_messages(choices: List[OpenAIChoice]) -> Lis return output_messages -class OpenAIResponsesImpl(OpenAIResponses): +class OpenAIResponsesImpl: def __init__( self, - config: OpenAIResponsesImplConfig, - models_api: Models, + persistence_store: KVStore, inference_api: Inference, tool_groups_api: ToolGroups, tool_runtime_api: ToolRuntime, ): - self.config = config - self.models_api = models_api + self.persistence_store = persistence_store self.inference_api = inference_api self.tool_groups_api = tool_groups_api self.tool_runtime_api = tool_runtime_api - async def initialize(self) -> None: - self.kvstore = await kvstore_impl(self.config.kvstore) - - async def shutdown(self) -> None: - logger.debug("OpenAIResponsesImpl.shutdown") - pass - async def get_openai_response( self, id: str, ) -> OpenAIResponseObject: key = f"{OPENAI_RESPONSES_PREFIX}{id}" - response_json = await self.kvstore.get(key=key) + response_json = await self.persistence_store.get(key=key) if response_json is None: raise ValueError(f"OpenAI response with id '{id}' not found") return OpenAIResponseObject.model_validate_json(response_json) @@ -122,11 +109,6 @@ class OpenAIResponsesImpl(OpenAIResponses): tools: Optional[List[OpenAIResponseInputTool]] = None, ): stream = False if stream is None else stream - model_obj = await self.models_api.get_model(model) - if model_obj is None: - raise ValueError(f"Model '{model}' not found") - if model_obj.model_type == ModelType.embedding: - raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions") messages: List[OpenAIMessageParam] = [] if previous_response_id: @@ -155,7 +137,7 @@ class OpenAIResponsesImpl(OpenAIResponses): chat_tools = await self._convert_response_tools_to_chat_tools(tools) if tools else None chat_response = await self.inference_api.openai_chat_completion( - model=model_obj.identifier, + model=model, messages=messages, tools=chat_tools, stream=stream, @@ -198,14 +180,14 @@ class OpenAIResponsesImpl(OpenAIResponses): output_messages: List[OpenAIResponseOutput] = [] if chat_response.choices[0].finish_reason == "tool_calls": output_messages.extend( - await self._execute_tool_and_return_final_output(model_obj.identifier, stream, chat_response, messages) + await self._execute_tool_and_return_final_output(model, stream, chat_response, messages) ) else: output_messages.extend(await _openai_choices_to_output_messages(chat_response.choices)) response = OpenAIResponseObject( created_at=chat_response.created, id=f"resp-{uuid.uuid4()}", - model=model_obj.identifier, + model=model, object="response", status="completed", output=output_messages, @@ -214,7 +196,7 @@ class OpenAIResponsesImpl(OpenAIResponses): if store: # Store in kvstore key = f"{OPENAI_RESPONSES_PREFIX}{response.id}" - await self.kvstore.set( + await self.persistence_store.set( key=key, value=response.model_dump_json(), ) diff --git a/llama_stack/providers/inline/openai_responses/__init__.py b/llama_stack/providers/inline/openai_responses/__init__.py deleted file mode 100644 index 76f15d478..000000000 --- a/llama_stack/providers/inline/openai_responses/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from typing import Any, Dict - -from llama_stack.apis.datatypes import Api - -from .config import OpenAIResponsesImplConfig - - -async def get_provider_impl(config: OpenAIResponsesImplConfig, deps: Dict[Api, Any]): - from .openai_responses import OpenAIResponsesImpl - - impl = OpenAIResponsesImpl( - config, deps[Api.models], deps[Api.inference], deps[Api.tool_groups], deps[Api.tool_runtime] - ) - await impl.initialize() - return impl diff --git a/llama_stack/providers/inline/openai_responses/config.py b/llama_stack/providers/inline/openai_responses/config.py deleted file mode 100644 index f97b2fe68..000000000 --- a/llama_stack/providers/inline/openai_responses/config.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from typing import Any, Dict - -from pydantic import BaseModel - -from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig - - -class OpenAIResponsesImplConfig(BaseModel): - kvstore: KVStoreConfig - - @classmethod - def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: - return { - "kvstore": SqliteKVStoreConfig.sample_run_config( - __distro_dir__=__distro_dir__, - db_name="openai_responses.db", - ) - } diff --git a/llama_stack/providers/registry/openai_responses.py b/llama_stack/providers/registry/openai_responses.py deleted file mode 100644 index b7f8d17a0..000000000 --- a/llama_stack/providers/registry/openai_responses.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from typing import List - -from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec - - -def available_providers() -> List[ProviderSpec]: - return [ - InlineProviderSpec( - api=Api.openai_responses, - provider_type="inline::openai-responses", - pip_packages=[], - module="llama_stack.providers.inline.openai_responses", - config_class="llama_stack.providers.inline.openai_responses.config.OpenAIResponsesImplConfig", - api_dependencies=[ - Api.models, - Api.inference, - Api.tool_groups, - Api.tool_runtime, - ], - ), - ] diff --git a/llama_stack/strong_typing/schema.py b/llama_stack/strong_typing/schema.py index 0f5121906..e755b4c12 100644 --- a/llama_stack/strong_typing/schema.py +++ b/llama_stack/strong_typing/schema.py @@ -478,6 +478,8 @@ class JsonSchemaGenerator: } return ret elif origin_type is Literal: + if len(typing.get_args(typ)) != 1: + print(f"Literal type {typ} has {len(typing.get_args(typ))} arguments") (literal_value,) = typing.get_args(typ) # unpack value of literal type schema = self.type_to_schema(type(literal_value)) schema["const"] = literal_value diff --git a/llama_stack/templates/remote-vllm/build.yaml b/llama_stack/templates/remote-vllm/build.yaml index b344f5e5a..b2bbf853a 100644 --- a/llama_stack/templates/remote-vllm/build.yaml +++ b/llama_stack/templates/remote-vllm/build.yaml @@ -24,8 +24,6 @@ distribution_spec: - inline::braintrust telemetry: - inline::meta-reference - openai_responses: - - inline::openai-responses tool_runtime: - remote::brave-search - remote::tavily-search diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index a58417714..bb69496aa 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -5,7 +5,6 @@ apis: - datasetio - eval - inference -- openai_responses - safety - scoring - telemetry @@ -92,14 +91,6 @@ providers: service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/remote-vllm/trace_store.db} - openai_responses: - - provider_id: openai-responses - provider_type: inline::openai-responses - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/openai_responses.db tool_runtime: - provider_id: brave-search provider_type: remote::brave-search diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 58087bba3..14f2da37e 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -5,7 +5,6 @@ apis: - datasetio - eval - inference -- openai_responses - safety - scoring - telemetry @@ -85,14 +84,6 @@ providers: service_name: "${env.OTEL_SERVICE_NAME:\u200B}" sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/remote-vllm/trace_store.db} - openai_responses: - - provider_id: openai-responses - provider_type: inline::openai-responses - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/openai_responses.db tool_runtime: - provider_id: brave-search provider_type: remote::brave-search diff --git a/llama_stack/templates/remote-vllm/vllm.py b/llama_stack/templates/remote-vllm/vllm.py index 12515d1ad..0f6c7659e 100644 --- a/llama_stack/templates/remote-vllm/vllm.py +++ 
b/llama_stack/templates/remote-vllm/vllm.py @@ -31,7 +31,6 @@ def get_distribution_template() -> DistributionTemplate: "datasetio": ["remote::huggingface", "inline::localfs"], "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], "telemetry": ["inline::meta-reference"], - "openai_responses": ["inline::openai-responses"], "tool_runtime": [ "remote::brave-search", "remote::tavily-search", diff --git a/llama_stack/templates/together/build.yaml b/llama_stack/templates/together/build.yaml index 81a47c5cd..834a3ecaf 100644 --- a/llama_stack/templates/together/build.yaml +++ b/llama_stack/templates/together/build.yaml @@ -24,8 +24,6 @@ distribution_spec: - inline::basic - inline::llm-as-judge - inline::braintrust - openai_responses: - - inline::openai-responses tool_runtime: - remote::brave-search - remote::tavily-search diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index fbeafce19..105ce896d 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -5,7 +5,6 @@ apis: - datasetio - eval - inference -- openai_responses - safety - scoring - telemetry @@ -88,14 +87,6 @@ providers: provider_type: inline::braintrust config: openai_api_key: ${env.OPENAI_API_KEY:} - openai_responses: - - provider_id: openai-responses - provider_type: inline::openai-responses - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/openai_responses.db tool_runtime: - provider_id: brave-search provider_type: remote::brave-search diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index 0c5d82c13..1f1613655 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -5,7 +5,6 @@ apis: - datasetio - eval - inference -- openai_responses - safety - scoring - telemetry @@ -83,14 +82,6 @@ providers: provider_type: inline::braintrust config: openai_api_key: ${env.OPENAI_API_KEY:} - openai_responses: - - provider_id: openai-responses - provider_type: inline::openai-responses - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/openai_responses.db tool_runtime: - provider_id: brave-search provider_type: remote::brave-search diff --git a/llama_stack/templates/together/together.py b/llama_stack/templates/together/together.py index 85b7645b3..a2bd87c97 100644 --- a/llama_stack/templates/together/together.py +++ b/llama_stack/templates/together/together.py @@ -36,7 +36,6 @@ def get_distribution_template() -> DistributionTemplate: "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "openai_responses": ["inline::openai-responses"], "tool_runtime": [ "remote::brave-search", "remote::tavily-search",
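
Usage sketch for the new routes -- a minimal example, assuming a Llama Stack server
on the default port 8321; the model id and prompt are placeholders. Because the
routes live under the OpenAI-compatible /v1/openai/v1 prefix, the official openai
Python client can be pointed at them directly:

    from openai import OpenAI

    # Point the client at the OpenAI-compatible prefix served by Llama Stack.
    client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

    # POST /v1/openai/v1/responses -- create a response
    response = client.responses.create(
        model="meta-llama/Llama-3.3-70B-Instruct",  # placeholder model id
        input="What is the capital of France?",
    )
    print(response.id, response.status)

    # GET /v1/openai/v1/responses/{id} -- retrieve it again by id
    fetched = client.responses.retrieve(response.id)
    assert fetched.id == response.id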