mirror of https://github.com/meta-llama/llama-stack.git
synced 2025-10-04 04:04:14 +00:00
chore(apis): unpublish deprecated /v1/inference apis (#3297)
# What does this PR do?

unpublish (make unavailable to users) the following APIs:

- `/v1/inference/completion`, replaced by `/v1/openai/v1/completions`
- `/v1/inference/chat-completion`, replaced by `/v1/openai/v1/chat/completions`
- `/v1/inference/embeddings`, replaced by `/v1/openai/v1/embeddings`
- `/v1/inference/batch-completion`, replaced by `/v1/openai/v1/batches`
- `/v1/inference/batch-chat-completion`, replaced by `/v1/openai/v1/batches`

note: the implementations are still available for internal use, e.g. the agents API still uses chat-completion.
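For callers migrating off the unpublished routes, a minimal before/after sketch in Python. Only the endpoint paths and the `model_id`/`content`/`stream` request fields come from this PR; the base URL, port, model name, and the OpenAI-style `model`/`prompt` body are illustrative assumptions.

```python
# Hypothetical migration sketch. Endpoint paths and the CompletionRequest
# fields (model_id, content, stream) come from this PR; the base URL, model
# name, and the OpenAI-style body (model, prompt) are assumptions.
import requests

BASE_URL = "http://localhost:8321"  # assumed local Llama Stack server

# Before: legacy inference route, unpublished by this PR (expected to fail now)
legacy = requests.post(
    f"{BASE_URL}/v1/inference/completion",
    json={"model_id": "llama3.2-3b", "content": "Hello, ", "stream": False},
)

# After: the OpenAI-compatible replacement route
replacement = requests.post(
    f"{BASE_URL}/v1/openai/v1/completions",
    json={"model": "llama3.2-3b", "prompt": "Hello, ", "stream": False},
)
print(replacement.json())
```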
parent 60484c5c4e
commit 53b15725b6
23 changed files with 3134 additions and 1347 deletions
169 docs/static/llama-stack-spec.html (vendored)
@@ -210,55 +210,6 @@
         }
       }
     },
-    "/v1/inference/completion": {
-      "post": {
-        "responses": {
-          "200": {
-            "description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.",
-            "content": {
-              "application/json": {
-                "schema": {
-                  "$ref": "#/components/schemas/CompletionResponse"
-                }
-              },
-              "text/event-stream": {
-                "schema": {
-                  "$ref": "#/components/schemas/CompletionResponseStreamChunk"
-                }
-              }
-            }
-          },
-          "400": {
-            "$ref": "#/components/responses/BadRequest400"
-          },
-          "429": {
-            "$ref": "#/components/responses/TooManyRequests429"
-          },
-          "500": {
-            "$ref": "#/components/responses/InternalServerError500"
-          },
-          "default": {
-            "$ref": "#/components/responses/DefaultError"
-          }
-        },
-        "tags": [
-          "Inference"
-        ],
-        "summary": "Generate a completion for the given content using the specified model.",
-        "description": "Generate a completion for the given content using the specified model.",
-        "parameters": [],
-        "requestBody": {
-          "content": {
-            "application/json": {
-              "schema": {
-                "$ref": "#/components/schemas/CompletionRequest"
-              }
-            }
-          },
-          "required": true
-        }
-      }
-    },
     "/v1/agents": {
       "get": {
         "responses": {
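The deleted `200` response above advertised two content types: `application/json` with a full `CompletionResponse` when `stream=False`, and a `text/event-stream` of `CompletionResponseStreamChunk` events when `stream=True`. A hedged sketch of how a client would have consumed the streaming mode; the `data:` SSE framing, base URL, and model name are assumptions, while the `delta` and `stop_reason` fields come from the schema removed in the next hunk.

```python
# Sketch of consuming the streaming mode the deleted endpoint documented.
# Assumptions: SSE "data: ..." framing, base URL, and model name.
import json
import requests

resp = requests.post(
    "http://localhost:8321/v1/inference/completion",  # route removed by this PR
    json={"model_id": "llama3.2-3b", "content": "Hello, ", "stream": True},
    stream=True,
)
for line in resp.iter_lines():
    if line.startswith(b"data: "):
        chunk = json.loads(line[len(b"data: "):])  # a CompletionResponseStreamChunk
        print(chunk["delta"], end="", flush=True)
        if chunk.get("stop_reason"):  # e.g. "end_of_turn" once generation is done
            break
```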
@@ -7299,126 +7250,6 @@
         "title": "ToolCallDelta",
         "description": "A tool call content delta for streaming responses."
       },
-      "CompletionRequest": {
-        "type": "object",
-        "properties": {
-          "model_id": {
-            "type": "string",
-            "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
-          },
-          "content": {
-            "$ref": "#/components/schemas/InterleavedContent",
-            "description": "The content to generate a completion for."
-          },
-          "sampling_params": {
-            "$ref": "#/components/schemas/SamplingParams",
-            "description": "(Optional) Parameters to control the sampling strategy."
-          },
-          "response_format": {
-            "$ref": "#/components/schemas/ResponseFormat",
-            "description": "(Optional) Grammar specification for guided (structured) decoding."
-          },
-          "stream": {
-            "type": "boolean",
-            "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
-          },
-          "logprobs": {
-            "type": "object",
-            "properties": {
-              "top_k": {
-                "type": "integer",
-                "default": 0,
-                "description": "How many tokens (for each position) to return log probabilities for."
-              }
-            },
-            "additionalProperties": false,
-            "description": "(Optional) If specified, log probabilities for each token position will be returned."
-          }
-        },
-        "additionalProperties": false,
-        "required": [
-          "model_id",
-          "content"
-        ],
-        "title": "CompletionRequest"
-      },
-      "CompletionResponse": {
-        "type": "object",
-        "properties": {
-          "metrics": {
-            "type": "array",
-            "items": {
-              "$ref": "#/components/schemas/MetricInResponse"
-            },
-            "description": "(Optional) List of metrics associated with the API response"
-          },
-          "content": {
-            "type": "string",
-            "description": "The generated completion text"
-          },
-          "stop_reason": {
-            "type": "string",
-            "enum": [
-              "end_of_turn",
-              "end_of_message",
-              "out_of_tokens"
-            ],
-            "description": "Reason why generation stopped"
-          },
-          "logprobs": {
-            "type": "array",
-            "items": {
-              "$ref": "#/components/schemas/TokenLogProbs"
-            },
-            "description": "Optional log probabilities for generated tokens"
-          }
-        },
-        "additionalProperties": false,
-        "required": [
-          "content",
-          "stop_reason"
-        ],
-        "title": "CompletionResponse",
-        "description": "Response from a completion request."
-      },
-      "CompletionResponseStreamChunk": {
-        "type": "object",
-        "properties": {
-          "metrics": {
-            "type": "array",
-            "items": {
-              "$ref": "#/components/schemas/MetricInResponse"
-            },
-            "description": "(Optional) List of metrics associated with the API response"
-          },
-          "delta": {
-            "type": "string",
-            "description": "New content generated since last chunk. This can be one or more tokens."
-          },
-          "stop_reason": {
-            "type": "string",
-            "enum": [
-              "end_of_turn",
-              "end_of_message",
-              "out_of_tokens"
-            ],
-            "description": "Optional reason why generation stopped, if complete"
-          },
-          "logprobs": {
-            "type": "array",
-            "items": {
-              "$ref": "#/components/schemas/TokenLogProbs"
-            },
-            "description": "Optional log probabilities for generated tokens"
-          }
-        },
-        "additionalProperties": false,
-        "required": [
-          "delta"
-        ],
-        "title": "CompletionResponseStreamChunk",
-        "description": "A chunk of a streamed completion response."
-      },
       "AgentConfig": {
         "type": "object",
         "properties": {
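For reference, the three removed schemas transcribed as Python typing, straight from the JSON Schema above. This rendering is not part of the repo; the `$ref`'d types (`InterleavedContent`, `SamplingParams`, `TokenLogProbs`, `MetricInResponse`) are collapsed to `Any` for brevity.

```python
# Transcription of the removed schemas as Python typing (Python 3.11+ for
# NotRequired). Not part of the repo; $ref'd types are collapsed to Any.
from typing import Any, Literal, NotRequired, TypedDict

StopReason = Literal["end_of_turn", "end_of_message", "out_of_tokens"]

class CompletionRequest(TypedDict):
    model_id: str                          # required: registered model identifier
    content: Any                           # required: InterleavedContent in the spec
    sampling_params: NotRequired[Any]      # optional SamplingParams
    response_format: NotRequired[Any]      # optional ResponseFormat for guided decoding
    stream: NotRequired[bool]              # optional, defaults to False
    logprobs: NotRequired[dict[str, int]]  # optional, e.g. {"top_k": 0}

class CompletionResponse(TypedDict):
    content: str                           # required: the generated completion text
    stop_reason: StopReason                # required
    logprobs: NotRequired[list[Any]]       # optional TokenLogProbs entries
    metrics: NotRequired[list[Any]]        # optional MetricInResponse entries

class CompletionResponseStreamChunk(TypedDict):
    delta: str                             # required: new content since last chunk
    stop_reason: NotRequired[StopReason]   # present once generation completes
    logprobs: NotRequired[list[Any]]
    metrics: NotRequired[list[Any]]
```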