Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 04:04:14 +00:00)
chore(apis): unpublish deprecated /v1/inference apis (#3297)
# What does this PR do?

Unpublish (make unavailable to users) the following APIs:

- `/v1/inference/completion`, replaced by `/v1/openai/v1/completions`
- `/v1/inference/chat-completion`, replaced by `/v1/openai/v1/chat/completions`
- `/v1/inference/embeddings`, replaced by `/v1/openai/v1/embeddings`
- `/v1/inference/batch-completion`, replaced by `/v1/openai/v1/batches`
- `/v1/inference/batch-chat-completion`, replaced by `/v1/openai/v1/batches`

Note: the implementations are still available for internal use; e.g. agents still use chat-completion.
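For callers migrating off the unpublished paths, the replacement endpoints are OpenAI-compatible. A minimal sketch of the new call, assuming a local Llama Stack server on the default port (8321) and an illustrative model id; adjust both to your deployment:

```python
import requests  # any HTTP client works; requests is used here for brevity

BASE_URL = "http://localhost:8321"  # assumed local Llama Stack server
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"  # illustrative; must be registered with the stack

# Before this change: POST /v1/inference/completion with {"model_id": ..., "content": ...}
# After this change:  POST /v1/openai/v1/completions with OpenAI-style {"model": ..., "prompt": ...}
resp = requests.post(
    f"{BASE_URL}/v1/openai/v1/completions",
    json={
        "model": MODEL_ID,
        "prompt": "Write a haiku about YAML.",
        "max_tokens": 64,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```

The main rename to watch for: the old path took `model_id` and `content` (see the removed `CompletionRequest` schema below), while the OpenAI-compatible path takes `model` and `prompt`.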
parent 60484c5c4e
commit 53b15725b6

23 changed files with 3134 additions and 1347 deletions
docs/static/llama-stack-spec.yaml (vendored): 143 changes
@@ -132,43 +132,6 @@ paths:
             schema:
               $ref: '#/components/schemas/ChatCompletionRequest'
         required: true
-  /v1/inference/completion:
-    post:
-      responses:
-        '200':
-          description: >-
-            If stream=False, returns a CompletionResponse with the full completion.
-            If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/CompletionResponse'
-            text/event-stream:
-              schema:
-                $ref: '#/components/schemas/CompletionResponseStreamChunk'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Inference
-      summary: >-
-        Generate a completion for the given content using the specified model.
-      description: >-
-        Generate a completion for the given content using the specified model.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/CompletionRequest'
-        required: true
   /v1/agents:
     get:
       responses:
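The removed handler distinguished `stream=False` (a single `CompletionResponse`) from `stream=True` (an SSE stream of `CompletionResponseStreamChunk`). On the replacement endpoint the same switch is the standard OpenAI `stream` flag, consumed as `data: {...}` SSE lines terminated by `data: [DONE]`. A minimal sketch, again assuming a local server on port 8321 and an illustrative model id:

```python
import json

import requests

BASE_URL = "http://localhost:8321"  # assumed local Llama Stack server

with requests.post(
    f"{BASE_URL}/v1/openai/v1/completions",
    json={
        "model": "meta-llama/Llama-3.1-8B-Instruct",  # illustrative model id
        "prompt": "Count to three.",
        "stream": True,
    },
    stream=True,
    timeout=60,
) as resp:
    resp.raise_for_status()
    # OpenAI-style SSE: each event is a "data: <json>" line; "[DONE]" ends the stream.
    for line in resp.iter_lines():
        if not line.startswith(b"data: "):
            continue  # skips empty keep-alive lines as well
        payload = line[len(b"data: "):]
        if payload == b"[DONE]":
            break
        chunk = json.loads(payload)
        print(chunk["choices"][0].get("text", ""), end="", flush=True)
```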
@@ -5292,112 +5255,6 @@ components:
       title: ToolCallDelta
       description: >-
         A tool call content delta for streaming responses.
-    CompletionRequest:
-      type: object
-      properties:
-        model_id:
-          type: string
-          description: >-
-            The identifier of the model to use. The model must be registered with
-            Llama Stack and available via the /models endpoint.
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            The content to generate a completion for.
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: >-
-            (Optional) Parameters to control the sampling strategy.
-        response_format:
-          $ref: '#/components/schemas/ResponseFormat'
-          description: >-
-            (Optional) Grammar specification for guided (structured) decoding.
-        stream:
-          type: boolean
-          description: >-
-            (Optional) If True, generate an SSE event stream of the response. Defaults
-            to False.
-        logprobs:
-          type: object
-          properties:
-            top_k:
-              type: integer
-              default: 0
-              description: >-
-                How many tokens (for each position) to return log probabilities for.
-          additionalProperties: false
-          description: >-
-            (Optional) If specified, log probabilities for each token position will
-            be returned.
-      additionalProperties: false
-      required:
-        - model_id
-        - content
-      title: CompletionRequest
-    CompletionResponse:
-      type: object
-      properties:
-        metrics:
-          type: array
-          items:
-            $ref: '#/components/schemas/MetricInResponse'
-          description: >-
-            (Optional) List of metrics associated with the API response
-        content:
-          type: string
-          description: The generated completion text
-        stop_reason:
-          type: string
-          enum:
-            - end_of_turn
-            - end_of_message
-            - out_of_tokens
-          description: Reason why generation stopped
-        logprobs:
-          type: array
-          items:
-            $ref: '#/components/schemas/TokenLogProbs'
-          description: >-
-            Optional log probabilities for generated tokens
-      additionalProperties: false
-      required:
-        - content
-        - stop_reason
-      title: CompletionResponse
-      description: Response from a completion request.
-    CompletionResponseStreamChunk:
-      type: object
-      properties:
-        metrics:
-          type: array
-          items:
-            $ref: '#/components/schemas/MetricInResponse'
-          description: >-
-            (Optional) List of metrics associated with the API response
-        delta:
-          type: string
-          description: >-
-            New content generated since last chunk. This can be one or more tokens.
-        stop_reason:
-          type: string
-          enum:
-            - end_of_turn
-            - end_of_message
-            - out_of_tokens
-          description: >-
-            Optional reason why generation stopped, if complete
-        logprobs:
-          type: array
-          items:
-            $ref: '#/components/schemas/TokenLogProbs'
-          description: >-
-            Optional log probabilities for generated tokens
-      additionalProperties: false
-      required:
-        - delta
-      title: CompletionResponseStreamChunk
-      description: >-
-        A chunk of a streamed completion response.
     AgentConfig:
       type: object
       properties:
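Although unpublished, these schemas still describe the payloads used internally (e.g. by agents). A sketch of a conforming request and response with illustrative values; the `sampling_params` strategy shape is an assumption, the rest follows the removed schemas above:

```python
# Request shape per the removed CompletionRequest schema.
request = {
    "model_id": "meta-llama/Llama-3.1-8B-Instruct",  # must be registered with Llama Stack (see /models)
    "content": "Write a haiku about YAML.",  # InterleavedContent; plain text here
    "sampling_params": {"strategy": {"type": "greedy"}},  # optional; strategy shape assumed
    "stream": False,  # optional, defaults to False
    "logprobs": {"top_k": 1},  # optional; top_k defaults to 0
}

# Response shape per the removed CompletionResponse schema.
response = {
    "content": "Indentation holds / nested keys in quiet rows / the parser exhales",
    "stop_reason": "end_of_turn",  # end_of_turn | end_of_message | out_of_tokens
    "logprobs": [],  # optional TokenLogProbs entries
}
```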