diff --git a/source/api_definitions.py b/source/api_definitions.py
index 84e0954f7..626fa0bdd 100644
--- a/source/api_definitions.py
+++ b/source/api_definitions.py
@@ -143,22 +143,26 @@ class BatchChatCompletionRequest:
class Inference(Protocol):
- """Set of methods that can be called on the inference service."""
+
+ @webmethod(route="/inference/completion")
def post_completion(
self,
request: CompletionRequest,
) -> Union[CompletionResponse, CompletionResponseStreamChunk]: ...
+ @webmethod(route="/inference/chat_completion")
def post_chat_completion(
self,
request: ChatCompletionRequest,
) -> Union[ChatCompletionResponse, ChatCompletionResponseStreamChunk]: ...
+ @webmethod(route="/inference/batch_completion")
def post_batch_completion(
self,
request: BatchCompletionRequest,
) -> List[CompletionResponse]: ...
+ @webmethod(route="/inference/batch_chat_completion")
def post_batch_chat_completion(
self,
request: BatchChatCompletionRequest,
diff --git a/source/openapi.html b/source/openapi.html
index a4b8e7ca7..ceb92c59f 100644
--- a/source/openapi.html
+++ b/source/openapi.html
@@ -386,7 +386,7 @@
]
}
},
- "/batch_chat_completion": {
+ "/inference/batch_chat_completion": {
"post": {
"responses": {
"200": {
@@ -416,7 +416,7 @@
}
}
},
- "/batch_completion": {
+ "/inference/batch_completion": {
"post": {
"responses": {
"200": {
@@ -446,7 +446,7 @@
}
}
},
- "/chat_completion": {
+ "/inference/chat_completion": {
"post": {
"responses": {
"200": {
@@ -483,7 +483,7 @@
}
}
},
- "/completion": {
+ "/inference/completion": {
"post": {
"responses": {
"200": {
@@ -3317,27 +3317,26 @@
}
],
"tags": [
+ {
+ "name": "RewardScoring"
+ },
{
"name": "PostTraining"
},
- {
- "name": "Inference",
- "x-displayName": "Set of methods that can be called on the inference service."
- },
- {
- "name": "MemoryBanks"
- },
- {
- "name": "Datasets"
- },
{
"name": "AgenticSystem"
},
+ {
+ "name": "Datasets"
+ },
{
"name": "SyntheticDataGeneration"
},
{
- "name": "RewardScoring"
+ "name": "Inference"
+ },
+ {
+ "name": "MemoryBanks"
},
{
"name": "ShieldConfig",
diff --git a/source/openapi.yaml b/source/openapi.yaml
index e14d457e2..dc25460bf 100644
--- a/source/openapi.yaml
+++ b/source/openapi.yaml
@@ -1563,83 +1563,6 @@ paths:
agent execution response.
tags:
- AgenticSystem
- /batch_chat_completion:
- post:
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/BatchChatCompletionRequest'
- required: true
- responses:
- '200':
- content:
- application/jsonl:
- schema:
- $ref: '#/components/schemas/ChatCompletionResponse'
- description: OK
- tags:
- - Inference
- /batch_completion:
- post:
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/BatchCompletionRequest'
- required: true
- responses:
- '200':
- content:
- application/jsonl:
- schema:
- $ref: '#/components/schemas/CompletionResponse'
- description: OK
- tags:
- - Inference
- /chat_completion:
- post:
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/ChatCompletionRequest'
- required: true
- responses:
- '200':
- content:
- application/json:
- schema:
- oneOf:
- - $ref: '#/components/schemas/ChatCompletionResponse'
- - $ref: '#/components/schemas/ChatCompletionResponseStreamChunk'
- description: Normal chat completion response. **OR** Streamed chat completion
- response. The actual response is a series of such objects.
- tags:
- - Inference
- /completion:
- post:
- parameters: []
- requestBody:
- content:
- application/json:
- schema:
- $ref: '#/components/schemas/CompletionRequest'
- required: true
- responses:
- '200':
- content:
- application/json:
- schema:
- oneOf:
- - $ref: '#/components/schemas/CompletionResponse'
- - $ref: '#/components/schemas/CompletionResponseStreamChunk'
- description: Normal completion response. **OR** streamed completion response.
- tags:
- - Inference
/datasets/create:
post:
parameters: []
@@ -1684,6 +1607,83 @@ paths:
description: OK
tags:
- Datasets
+ /inference/batch_chat_completion:
+ post:
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/BatchChatCompletionRequest'
+ required: true
+ responses:
+ '200':
+ content:
+ application/jsonl:
+ schema:
+ $ref: '#/components/schemas/ChatCompletionResponse'
+ description: OK
+ tags:
+ - Inference
+ /inference/batch_completion:
+ post:
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/BatchCompletionRequest'
+ required: true
+ responses:
+ '200':
+ content:
+ application/jsonl:
+ schema:
+ $ref: '#/components/schemas/CompletionResponse'
+ description: OK
+ tags:
+ - Inference
+ /inference/chat_completion:
+ post:
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ChatCompletionRequest'
+ required: true
+ responses:
+ '200':
+ content:
+ application/json:
+ schema:
+ oneOf:
+ - $ref: '#/components/schemas/ChatCompletionResponse'
+ - $ref: '#/components/schemas/ChatCompletionResponseStreamChunk'
+ description: Normal chat completion response. **OR** Streamed chat completion
+ response. The actual response is a series of such objects.
+ tags:
+ - Inference
+ /inference/completion:
+ post:
+ parameters: []
+ requestBody:
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/CompletionRequest'
+ required: true
+ responses:
+ '200':
+ content:
+ application/json:
+ schema:
+ oneOf:
+ - $ref: '#/components/schemas/CompletionResponse'
+ - $ref: '#/components/schemas/CompletionResponseStreamChunk'
+ description: Normal completion response. **OR** streamed completion response.
+ tags:
+ - Inference
/memory_bank/delete:
post:
parameters:
@@ -2015,14 +2015,13 @@ security:
servers:
- url: http://any-hosted-llama-stack.com
tags:
-- name: PostTraining
-- name: Inference
- x-displayName: Set of methods that can be called on the inference service.
-- name: MemoryBanks
-- name: Datasets
-- name: AgenticSystem
-- name: SyntheticDataGeneration
- name: RewardScoring
+- name: PostTraining
+- name: AgenticSystem
+- name: Datasets
+- name: SyntheticDataGeneration
+- name: Inference
+- name: MemoryBanks
- description:
name: ShieldConfig
- description: