mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-15 06:00:48 +00:00
Merge 306625025a
into e69acbafbf
This commit is contained in:
commit
7dbd03be07
8 changed files with 336 additions and 1 deletions
132
docs/_static/llama-stack-spec.html
vendored
132
docs/_static/llama-stack-spec.html
vendored
|
@ -4605,6 +4605,49 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"/v1/inference/rerank": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "RerankResponse with indices sorted by relevance score (descending).",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/RerankResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Inference"
|
||||
],
|
||||
"description": "Rerank a list of documents based on their relevance to a query.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/RerankRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume": {
|
||||
"post": {
|
||||
"responses": {
|
||||
|
@ -16585,6 +16628,95 @@
|
|||
],
|
||||
"title": "RegisterVectorDbRequest"
|
||||
},
|
||||
"RerankRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"model": {
|
||||
"type": "string",
|
||||
"description": "The identifier of the reranking model to use."
|
||||
},
|
||||
"query": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam"
|
||||
},
|
||||
{
|
||||
"$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
|
||||
}
|
||||
],
|
||||
"description": "The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length."
|
||||
},
|
||||
"items": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam"
|
||||
},
|
||||
{
|
||||
"$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length."
|
||||
},
|
||||
"max_num_results": {
|
||||
"type": "integer",
|
||||
"description": "(Optional) Maximum number of results to return. Default: returns all."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"model",
|
||||
"query",
|
||||
"items"
|
||||
],
|
||||
"title": "RerankRequest"
|
||||
},
|
||||
"RerankData": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"index": {
|
||||
"type": "integer",
|
||||
"description": "The original index of the document in the input list"
|
||||
},
|
||||
"relevance_score": {
|
||||
"type": "number",
|
||||
"description": "The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"index",
|
||||
"relevance_score"
|
||||
],
|
||||
"title": "RerankData",
|
||||
"description": "A single rerank result from a reranking response."
|
||||
},
|
||||
"RerankResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/RerankData"
|
||||
},
|
||||
"description": "List of rerank result objects, sorted by relevance score (descending)"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"data"
|
||||
],
|
||||
"title": "RerankResponse",
|
||||
"description": "Response from a reranking request."
|
||||
},
|
||||
"ResumeAgentTurnRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
101
docs/_static/llama-stack-spec.yaml
vendored
101
docs/_static/llama-stack-spec.yaml
vendored
|
@ -3264,6 +3264,37 @@ paths:
|
|||
schema:
|
||||
$ref: '#/components/schemas/QueryTracesRequest'
|
||||
required: true
|
||||
/v1/inference/rerank:
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: >-
|
||||
RerankResponse with indices sorted by relevance score (descending).
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/RerankResponse'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Inference
|
||||
description: >-
|
||||
Rerank a list of documents based on their relevance to a query.
|
||||
parameters: []
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/RerankRequest'
|
||||
required: true
|
||||
/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume:
|
||||
post:
|
||||
responses:
|
||||
|
@ -12335,6 +12366,76 @@ components:
|
|||
- vector_db_id
|
||||
- embedding_model
|
||||
title: RegisterVectorDbRequest
|
||||
RerankRequest:
|
||||
type: object
|
||||
properties:
|
||||
model:
|
||||
type: string
|
||||
description: >-
|
||||
The identifier of the reranking model to use.
|
||||
query:
|
||||
oneOf:
|
||||
- type: string
|
||||
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
|
||||
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
|
||||
description: >-
|
||||
The search query to rank items against. Can be a string, text content
|
||||
part, or image content part. The input must not exceed the model's max
|
||||
input token length.
|
||||
items:
|
||||
type: array
|
||||
items:
|
||||
oneOf:
|
||||
- type: string
|
||||
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
|
||||
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
|
||||
description: >-
|
||||
List of items to rerank. Each item can be a string, text content part,
|
||||
or image content part. Each input must not exceed the model's max input
|
||||
token length.
|
||||
max_num_results:
|
||||
type: integer
|
||||
description: >-
|
||||
(Optional) Maximum number of results to return. Default: returns all.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- model
|
||||
- query
|
||||
- items
|
||||
title: RerankRequest
|
||||
RerankData:
|
||||
type: object
|
||||
properties:
|
||||
index:
|
||||
type: integer
|
||||
description: >-
|
||||
The original index of the document in the input list
|
||||
relevance_score:
|
||||
type: number
|
||||
description: >-
|
||||
The relevance score from the model output. Values are inverted when applicable
|
||||
so that higher scores indicate greater relevance.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- index
|
||||
- relevance_score
|
||||
title: RerankData
|
||||
description: >-
|
||||
A single rerank result from a reranking response.
|
||||
RerankResponse:
|
||||
type: object
|
||||
properties:
|
||||
data:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/RerankData'
|
||||
description: >-
|
||||
List of rerank result objects, sorted by relevance score (descending)
|
||||
additionalProperties: false
|
||||
required:
|
||||
- data
|
||||
title: RerankResponse
|
||||
description: Response from a reranking request.
|
||||
ResumeAgentTurnRequest:
|
||||
type: object
|
||||
properties:
|
||||
|
|
|
@ -473,6 +473,28 @@ class EmbeddingsResponse(BaseModel):
|
|||
embeddings: list[list[float]]
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class RerankData(BaseModel):
|
||||
"""A single rerank result from a reranking response.
|
||||
|
||||
:param index: The original index of the document in the input list
|
||||
:param relevance_score: The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance.
|
||||
"""
|
||||
|
||||
index: int
|
||||
relevance_score: float
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class RerankResponse(BaseModel):
|
||||
"""Response from a reranking request.
|
||||
|
||||
:param data: List of rerank result objects, sorted by relevance score (descending)
|
||||
"""
|
||||
|
||||
data: list[RerankData]
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIChatCompletionContentPartTextParam(BaseModel):
|
||||
"""Text content part for OpenAI-compatible chat completion messages.
|
||||
|
@ -1131,6 +1153,24 @@ class InferenceProvider(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/inference/rerank", method="POST", experimental=True)
|
||||
async def rerank(
|
||||
self,
|
||||
model: str,
|
||||
query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
|
||||
items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
|
||||
max_num_results: int | None = None,
|
||||
) -> RerankResponse:
|
||||
"""Rerank a list of documents based on their relevance to a query.
|
||||
|
||||
:param model: The identifier of the reranking model to use.
|
||||
:param query: The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length.
|
||||
:param items: List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length.
|
||||
:param max_num_results: (Optional) Maximum number of results to return. Default: returns all.
|
||||
:returns: RerankResponse with indices sorted by relevance score (descending).
|
||||
"""
|
||||
raise NotImplementedError("Reranking is not implemented")
|
||||
|
||||
@webmethod(route="/openai/v1/completions", method="POST")
|
||||
async def openai_completion(
|
||||
self,
|
||||
|
|
|
@ -33,6 +33,9 @@ from llama_stack.apis.inference import (
|
|||
InterleavedContent,
|
||||
LogProbConfig,
|
||||
Message,
|
||||
OpenAIChatCompletionContentPartImageParam,
|
||||
OpenAIChatCompletionContentPartTextParam,
|
||||
RerankResponse,
|
||||
ResponseFormat,
|
||||
SamplingParams,
|
||||
StopReason,
|
||||
|
@ -442,6 +445,15 @@ class MetaReferenceInferenceImpl(
|
|||
results = await self._nonstream_chat_completion(request_batch)
|
||||
return BatchChatCompletionResponse(batch=results)
|
||||
|
||||
async def rerank(
|
||||
self,
|
||||
model: str,
|
||||
query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
|
||||
items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
|
||||
max_num_results: int | None = None,
|
||||
) -> RerankResponse:
|
||||
raise NotImplementedError("Reranking is not supported for Meta Reference")
|
||||
|
||||
async def _nonstream_chat_completion(
|
||||
self, request_batch: list[ChatCompletionRequest]
|
||||
) -> list[ChatCompletionResponse]:
|
||||
|
|
|
@ -13,6 +13,9 @@ from llama_stack.apis.inference import (
|
|||
InterleavedContent,
|
||||
LogProbConfig,
|
||||
Message,
|
||||
OpenAIChatCompletionContentPartImageParam,
|
||||
OpenAIChatCompletionContentPartTextParam,
|
||||
RerankResponse,
|
||||
ResponseFormat,
|
||||
SamplingParams,
|
||||
ToolChoice,
|
||||
|
@ -122,3 +125,12 @@ class SentenceTransformersInferenceImpl(
|
|||
logprobs: LogProbConfig | None = None,
|
||||
):
|
||||
raise NotImplementedError("Batch chat completion is not supported for Sentence Transformers")
|
||||
|
||||
async def rerank(
|
||||
self,
|
||||
model: str,
|
||||
query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
|
||||
items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
|
||||
max_num_results: int | None = None,
|
||||
) -> RerankResponse:
|
||||
raise NotImplementedError("Reranking is not supported for Sentence Transformers")
|
||||
|
|
|
@ -5,6 +5,11 @@
|
|||
# the root directory of this source tree.
|
||||
import logging
|
||||
|
||||
from llama_stack.apis.inference import (
|
||||
OpenAIChatCompletionContentPartImageParam,
|
||||
OpenAIChatCompletionContentPartTextParam,
|
||||
RerankResponse,
|
||||
)
|
||||
from llama_stack.providers.remote.inference.llama_openai_compat.config import LlamaCompatConfig
|
||||
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
|
||||
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
|
||||
|
@ -55,3 +60,12 @@ class LlamaCompatInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
|
|||
|
||||
async def shutdown(self):
|
||||
await super().shutdown()
|
||||
|
||||
async def rerank(
|
||||
self,
|
||||
model: str,
|
||||
query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
|
||||
items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
|
||||
max_num_results: int | None = None,
|
||||
) -> RerankResponse:
|
||||
raise NotImplementedError("Reranking is not supported for Llama OpenAI Compat")
|
||||
|
|
|
@ -37,11 +37,14 @@ from llama_stack.apis.inference import (
|
|||
Message,
|
||||
OpenAIChatCompletion,
|
||||
OpenAIChatCompletionChunk,
|
||||
OpenAIChatCompletionContentPartImageParam,
|
||||
OpenAIChatCompletionContentPartTextParam,
|
||||
OpenAICompletion,
|
||||
OpenAIEmbeddingsResponse,
|
||||
OpenAIEmbeddingUsage,
|
||||
OpenAIMessageParam,
|
||||
OpenAIResponseFormatParam,
|
||||
RerankResponse,
|
||||
ResponseFormat,
|
||||
SamplingParams,
|
||||
TextTruncation,
|
||||
|
@ -641,6 +644,15 @@ class OllamaInferenceAdapter(
|
|||
):
|
||||
raise NotImplementedError("Batch chat completion is not supported for Ollama")
|
||||
|
||||
async def rerank(
|
||||
self,
|
||||
model: str,
|
||||
query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
|
||||
items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
|
||||
max_num_results: int | None = None,
|
||||
) -> RerankResponse:
|
||||
raise NotImplementedError("Reranking is not supported for Ollama")
|
||||
|
||||
|
||||
async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]:
|
||||
async def _convert_content(content) -> dict:
|
||||
|
|
|
@ -39,12 +39,15 @@ from llama_stack.apis.inference import (
|
|||
Message,
|
||||
ModelStore,
|
||||
OpenAIChatCompletion,
|
||||
OpenAIChatCompletionContentPartImageParam,
|
||||
OpenAIChatCompletionContentPartTextParam,
|
||||
OpenAICompletion,
|
||||
OpenAIEmbeddingData,
|
||||
OpenAIEmbeddingsResponse,
|
||||
OpenAIEmbeddingUsage,
|
||||
OpenAIMessageParam,
|
||||
OpenAIResponseFormatParam,
|
||||
RerankResponse,
|
||||
ResponseFormat,
|
||||
SamplingParams,
|
||||
TextTruncation,
|
||||
|
@ -732,4 +735,13 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
|
|||
response_format: ResponseFormat | None = None,
|
||||
logprobs: LogProbConfig | None = None,
|
||||
):
|
||||
raise NotImplementedError("Batch chat completion is not supported for Ollama")
|
||||
raise NotImplementedError("Batch chat completion is not supported for vLLM")
|
||||
|
||||
async def rerank(
|
||||
self,
|
||||
model: str,
|
||||
query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
|
||||
items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
|
||||
max_num_results: int | None = None,
|
||||
) -> RerankResponse:
|
||||
raise NotImplementedError("Reranking is not supported for vLLM")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue