chore(apis): unpublish deprecated /v1/inference apis (#3297)

# What does this PR do?

unpublish (make unavailable to users) the following apis -
 - `/v1/inference/completion`, replaced by `/v1/openai/v1/completions`
- `/v1/inference/chat-completion`, replaced by
`/v1/openai/v1/chat/completions`
 - `/v1/inference/embeddings`, replaced by `/v1/openai/v1/embeddings`
 - `/v1/inference/batch-completion`, replaced by `/v1/openai/v1/batches`
- `/v1/inference/batch-chat-completion`, replaced by
`/v1/openai/v1/batches`

note: the implementations are still available for internal use, e.g.
agents uses chat-completion.
This commit is contained in:
Matthew Farrellee 2025-09-27 14:20:06 -04:00 committed by GitHub
parent 60484c5c4e
commit 53b15725b6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 3134 additions and 1347 deletions

View file

@ -210,55 +210,6 @@
}
}
},
"/v1/inference/completion": {
"post": {
"responses": {
"200": {
"description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CompletionResponse"
}
},
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/CompletionResponseStreamChunk"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Inference"
],
"summary": "Generate a completion for the given content using the specified model.",
"description": "Generate a completion for the given content using the specified model.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CompletionRequest"
}
}
},
"required": true
}
}
},
"/v1/agents": {
"get": {
"responses": {
@ -7299,126 +7250,6 @@
"title": "ToolCallDelta",
"description": "A tool call content delta for streaming responses."
},
"CompletionRequest": {
"type": "object",
"properties": {
"model_id": {
"type": "string",
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content to generate a completion for."
},
"sampling_params": {
"$ref": "#/components/schemas/SamplingParams",
"description": "(Optional) Parameters to control the sampling strategy."
},
"response_format": {
"$ref": "#/components/schemas/ResponseFormat",
"description": "(Optional) Grammar specification for guided (structured) decoding."
},
"stream": {
"type": "boolean",
"description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
},
"logprobs": {
"type": "object",
"properties": {
"top_k": {
"type": "integer",
"default": 0,
"description": "How many tokens (for each position) to return log probabilities for."
}
},
"additionalProperties": false,
"description": "(Optional) If specified, log probabilities for each token position will be returned."
}
},
"additionalProperties": false,
"required": [
"model_id",
"content"
],
"title": "CompletionRequest"
},
"CompletionResponse": {
"type": "object",
"properties": {
"metrics": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricInResponse"
},
"description": "(Optional) List of metrics associated with the API response"
},
"content": {
"type": "string",
"description": "The generated completion text"
},
"stop_reason": {
"type": "string",
"enum": [
"end_of_turn",
"end_of_message",
"out_of_tokens"
],
"description": "Reason why generation stopped"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
},
"description": "Optional log probabilities for generated tokens"
}
},
"additionalProperties": false,
"required": [
"content",
"stop_reason"
],
"title": "CompletionResponse",
"description": "Response from a completion request."
},
"CompletionResponseStreamChunk": {
"type": "object",
"properties": {
"metrics": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricInResponse"
},
"description": "(Optional) List of metrics associated with the API response"
},
"delta": {
"type": "string",
"description": "New content generated since last chunk. This can be one or more tokens."
},
"stop_reason": {
"type": "string",
"enum": [
"end_of_turn",
"end_of_message",
"out_of_tokens"
],
"description": "Optional reason why generation stopped, if complete"
},
"logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/TokenLogProbs"
},
"description": "Optional log probabilities for generated tokens"
}
},
"additionalProperties": false,
"required": [
"delta"
],
"title": "CompletionResponseStreamChunk",
"description": "A chunk of a streamed completion response."
},
"AgentConfig": {
"type": "object",
"properties": {