chore(apis): unpublish deprecated /v1/inference apis (#3297)

# What does this PR do?

unpublish (make unavailable to users) the following APIs:

- `/v1/inference/completion`, replaced by `/v1/openai/v1/completions`
- `/v1/inference/chat-completion`, replaced by `/v1/openai/v1/chat/completions`
- `/v1/inference/embeddings`, replaced by `/v1/openai/v1/embeddings`
- `/v1/inference/batch-completion`, replaced by `/v1/openai/v1/batches`
- `/v1/inference/batch-chat-completion`, replaced by `/v1/openai/v1/batches`

Note: the implementations remain available for internal use; e.g., the agents implementation still uses chat-completion. A migration sketch for external callers follows.
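
For external callers, the migration is mostly a base-URL and parameter-name change. A minimal sketch using the `openai` Python client against a Llama Stack server; the host/port, API key, and model name are illustrative assumptions, not part of this PR:

```python
from openai import OpenAI

# Standard OpenAI client pointed at Llama Stack's OpenAI-compatible routes.
# Base URL, API key, and model name below are assumptions for illustration.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# Before: POST /v1/inference/chat-completion with a "model_id" field.
# After:  POST /v1/openai/v1/chat/completions with standard OpenAI parameters.
resp = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Say hello."}],
)
print(resp.choices[0].message.content)
```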
Commit 53b15725b6 (parent 60484c5c4e) by Matthew Farrellee, 2025-09-27 14:20:06 -04:00, committed by GitHub.
23 changed files with 3134 additions and 1347 deletions.

OpenAPI spec (JSON):

@@ -210,55 +210,6 @@
}
}
},
"/v1/inference/completion": {
"post": {
"responses": {
"200": {
"description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CompletionResponse"
}
},
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/CompletionResponseStreamChunk"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Inference"
],
"summary": "Generate a completion for the given content using the specified model.",
"description": "Generate a completion for the given content using the specified model.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CompletionRequest"
}
}
},
"required": true
}
}
},
"/v1/agents": {
"get": {
"responses": {
@@ -7299,126 +7250,6 @@
"title": "ToolCallDelta",
"description": "A tool call content delta for streaming responses."
},
"CompletionRequest": {
"type": "object",
"properties": {
"model_id": {
"type": "string",
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content to generate a completion for."
},
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams",
"description": "(Optional) Parameters to control the sampling strategy."
},
"response_format": {
"$ref": "#/components/schemas/ResponseFormat",
"description": "(Optional) Grammar specification for guided (structured) decoding."
},
"stream": {
"type": "boolean",
"description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
},
"logprobs": {
"type": "object",
"properties": {
"top_k": {
"type": "integer",
"default": 0,
"description": "How many tokens (for each position) to return log probabilities for."
}
},
"additionalProperties": false,
"description": "(Optional) If specified, log probabilities for each token position will be returned."
}
},
"additionalProperties": false,
"required": [
"model_id",
"content"
],
"title": "CompletionRequest"
},
"CompletionResponse": {
"type": "object",
"properties": {
"metrics": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricInResponse"
},
"description": "(Optional) List of metrics associated with the API response"
},
"content": {
"type": "string",
"description": "The generated completion text"
},
"stop_reason": {
"type": "string",
"enum": [
"end_of_turn",
"end_of_message",
"out_of_tokens"
],
"description": "Reason why generation stopped"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
},
"description": "Optional log probabilities for generated tokens"
}
},
"additionalProperties": false,
"required": [
"content",
"stop_reason"
],
"title": "CompletionResponse",
"description": "Response from a completion request."
},
"CompletionResponseStreamChunk": {
"type": "object",
"properties": {
"metrics": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricInResponse"
},
"description": "(Optional) List of metrics associated with the API response"
},
"delta": {
"type": "string",
"description": "New content generated since last chunk. This can be one or more tokens."
},
"stop_reason": {
"type": "string",
"enum": [
"end_of_turn",
"end_of_message",
"out_of_tokens"
],
"description": "Optional reason why generation stopped, if complete"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
},
"description": "Optional log probabilities for generated tokens"
}
},
"additionalProperties": false,
"required": [
"delta"
],
"title": "CompletionResponseStreamChunk",
"description": "A chunk of a streamed completion response."
},
"AgentConfig": {
"type": "object",
"properties": {

OpenAPI spec (YAML):

@@ -132,43 +132,6 @@ paths:
schema:
$ref: '#/components/schemas/ChatCompletionRequest'
required: true
/v1/inference/completion:
post:
responses:
'200':
description: >-
If stream=False, returns a CompletionResponse with the full completion.
If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.
content:
application/json:
schema:
$ref: '#/components/schemas/CompletionResponse'
text/event-stream:
schema:
$ref: '#/components/schemas/CompletionResponseStreamChunk'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate a completion for the given content using the specified model.
description: >-
Generate a completion for the given content using the specified model.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/CompletionRequest'
required: true
/v1/agents:
get:
responses:
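
One way to confirm the unpublish against a built spec is to load it and assert the five paths are gone. A sketch assuming PyYAML and a local copy of the generated YAML spec (the filename is an assumption):

```python
import yaml  # pip install pyyaml

# Spec path is an assumption; point it at the generated YAML spec.
with open("llama-stack-spec.yaml") as f:
    spec = yaml.safe_load(f)

removed = [
    "/v1/inference/completion",
    "/v1/inference/chat-completion",
    "/v1/inference/embeddings",
    "/v1/inference/batch-completion",
    "/v1/inference/batch-chat-completion",
]
for path in removed:
    assert path not in spec.get("paths", {}), f"{path} is still published"
```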
@@ -5292,112 +5255,6 @@ components:
title: ToolCallDelta
description: >-
A tool call content delta for streaming responses.
CompletionRequest:
type: object
properties:
model_id:
type: string
description: >-
The identifier of the model to use. The model must be registered with
Llama Stack and available via the /models endpoint.
content:
$ref: '#/components/schemas/InterleavedContent'
description: >-
The content to generate a completion for.
sampling_params:
$ref: '#/components/schemas/SamplingParams'
description: >-
(Optional) Parameters to control the sampling strategy.
response_format:
$ref: '#/components/schemas/ResponseFormat'
description: >-
(Optional) Grammar specification for guided (structured) decoding.
stream:
type: boolean
description: >-
(Optional) If True, generate an SSE event stream of the response. Defaults
to False.
logprobs:
type: object
properties:
top_k:
type: integer
default: 0
description: >-
How many tokens (for each position) to return log probabilities for.
additionalProperties: false
description: >-
(Optional) If specified, log probabilities for each token position will
be returned.
additionalProperties: false
required:
- model_id
- content
title: CompletionRequest
CompletionResponse:
type: object
properties:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricInResponse'
description: >-
(Optional) List of metrics associated with the API response
content:
type: string
description: The generated completion text
stop_reason:
type: string
enum:
- end_of_turn
- end_of_message
- out_of_tokens
description: Reason why generation stopped
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
description: >-
Optional log probabilities for generated tokens
additionalProperties: false
required:
- content
- stop_reason
title: CompletionResponse
description: Response from a completion request.
CompletionResponseStreamChunk:
type: object
properties:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricInResponse'
description: >-
(Optional) List of metrics associated with the API response
delta:
type: string
description: >-
New content generated since last chunk. This can be one or more tokens.
stop_reason:
type: string
enum:
- end_of_turn
- end_of_message
- out_of_tokens
description: >-
Optional reason why generation stopped, if complete
logprobs:
type: array
items:
$ref: '#/components/schemas/TokenLogProbs'
description: >-
Optional log probabilities for generated tokens
additionalProperties: false
required:
- delta
title: CompletionResponseStreamChunk
description: >-
A chunk of a streamed completion response.
AgentConfig:
type: object
properties:
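
The same base-URL switch covers the other replacements. For example, a sketch of the embeddings migration via the standard OpenAI client; the server address and model name are placeholders, not part of this PR:

```python
from openai import OpenAI

# Base URL and model name are illustrative assumptions.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# Replaces POST /v1/inference/embeddings.
emb = client.embeddings.create(
    model="all-MiniLM-L6-v2",  # placeholder; any registered embedding model
    input=["unpublish deprecated inference APIs"],
)
print(len(emb.data[0].embedding))
```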