From 2b6307467642e00d604480abc523fd989330e03d Mon Sep 17 00:00:00 2001
From: Dalton Flanagan <6599399+dltn@users.noreply.github.com>
Date: Tue, 10 Sep 2024 01:14:11 -0400
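Subject: [PATCH] add /inference/chat_completion to SSE special case

Add "/inference/chat_completion" to STREAMING_ENDPOINTS in
rfcs/openapi_generator/generate.py and commit the regenerated spec
files, in which a 200 response previously declared as application/json
is now declared as text/event-stream. The "Generated at" timestamp and
the reordered tag list in the .html/.yaml specs appear to be artifacts
of the same regeneration rather than intentional changes.
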
---
 .../llama-stack-spec.html                     | 36 +++++++++----------
 .../llama-stack-spec.yaml                     | 18 +++++-----
 rfcs/openapi_generator/generate.py            |  5 ++-
 3 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.html b/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.html
index a05dc34fa..f5f5fa154 100644
--- a/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.html
+++ b/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.html
@@ -21,7 +21,7 @@
     "info": {
         "title": "[DRAFT] Llama Stack Specification",
         "version": "0.0.1",
-        "description": "This is the specification of the llama stack that provides\n                a set of endpoints and their corresponding interfaces that are tailored to\n                best leverage Llama Models. The specification is still in draft and subject to change.\n                Generated at 2024-09-09 11:19:39.855375"
+        "description": "This is the specification of the llama stack that provides\n                a set of endpoints and their corresponding interfaces that are tailored to\n                best leverage Llama Models. The specification is still in draft and subject to change.\n                Generated at 2024-09-10 01:13:08.531639"
     },
     "servers": [
         {
@@ -141,7 +141,7 @@
                     "200": {
                         "description": "SSE-stream of these events.",
                         "content": {
-                            "application/json": {
+                            "text/event-stream": {
                                 "schema": {
                                     "$ref": "#/components/schemas/ChatCompletionResponseStreamChunk"
                                 }
@@ -6062,35 +6062,35 @@
         }
     ],
     "tags": [
-        {
-            "name": "Inference"
-        },
-        {
-            "name": "Datasets"
-        },
-        {
-            "name": "Evaluations"
-        },
-        {
-            "name": "Memory"
-        },
         {
             "name": "SyntheticDataGeneration"
         },
-        {
-            "name": "PostTraining"
-        },
         {
             "name": "RewardScoring"
         },
+        {
+            "name": "Datasets"
+        },
+        {
+            "name": "Memory"
+        },
+        {
+            "name": "AgenticSystem"
+        },
         {
             "name": "BatchInference"
         },
+        {
+            "name": "PostTraining"
+        },
+        {
+            "name": "Evaluations"
+        },
         {
             "name": "Telemetry"
         },
         {
-            "name": "AgenticSystem"
+            "name": "Inference"
         },
         {
             "name": "BatchChatCompletionRequest",
diff --git a/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.yaml b/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.yaml
index cfd01b291..3c3475fff 100644
--- a/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.yaml
+++ b/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.yaml
@@ -2777,7 +2777,7 @@ info:
   description: "This is the specification of the llama stack that provides\n     \
     \           a set of endpoints and their corresponding interfaces that are tailored\
     \ to\n                best leverage Llama Models. The specification is still in\
-    \ draft and subject to change.\n                Generated at 2024-09-09 11:19:39.855375"
+    \ draft and subject to change.\n                Generated at 2024-09-10 01:13:08.531639"
   title: '[DRAFT] Llama Stack Specification'
   version: 0.0.1
 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
@@ -3302,7 +3302,7 @@ paths:
       responses:
         '200':
           content:
-            application/json:
+            text/event-stream:
               schema:
                 $ref: '#/components/schemas/ChatCompletionResponseStreamChunk'
           description: SSE-stream of these events.
@@ -3729,16 +3729,16 @@ security:
 servers:
 - url: http://any-hosted-llama-stack.com
 tags:
-- name: Inference
-- name: Datasets
-- name: Evaluations
-- name: Memory
 - name: SyntheticDataGeneration
-- name: PostTraining
 - name: RewardScoring
-- name: BatchInference
-- name: Telemetry
+- name: Datasets
+- name: Memory
 - name: AgenticSystem
+- name: BatchInference
+- name: PostTraining
+- name: Evaluations
+- name: Telemetry
+- name: Inference
 - description: <SchemaDefinition schemaRef="#/components/schemas/BatchChatCompletionRequest"
     />
   name: BatchChatCompletionRequest
diff --git a/rfcs/openapi_generator/generate.py b/rfcs/openapi_generator/generate.py
index ab9774e70..279389a47 100644
--- a/rfcs/openapi_generator/generate.py
+++ b/rfcs/openapi_generator/generate.py
@@ -35,7 +35,10 @@ from llama_toolchain.stack import LlamaStack
 
 
 # TODO: this should be fixed in the generator itself so it reads appropriate annotations
-STREAMING_ENDPOINTS = ["/agentic_system/turn/create"]
+STREAMING_ENDPOINTS = [
+    "/agentic_system/turn/create",
+    "/inference/chat_completion",
+]
 
 
 def patch_sse_stream_responses(spec: Specification):