forked from phoenix-oss/llama-stack-mirror
feat: OpenAI-Compatible models, completions, chat/completions (#1894)
# What does this PR do?

This stubs in some OpenAI server-side compatibility with three new endpoints:

* /v1/openai/v1/models
* /v1/openai/v1/completions
* /v1/openai/v1/chat/completions

This gives common inference apps using OpenAI clients the ability to talk to Llama Stack using an endpoint like http://localhost:8321/v1/openai/v1 . The two "v1" instances in there aren't awesome, but the thinking is that Llama Stack's API is v1 and our OpenAI compatibility layer is compatible with OpenAI V1. Also, some OpenAI clients implicitly assume the URL ends with "v1", so this gives maximum compatibility.

The openai models endpoint is implemented in the routing layer, and just returns all the models Llama Stack knows about.

The following providers should be working with the new OpenAI completions and chat/completions API:

* remote::anthropic (untested)
* remote::cerebras-openai-compat (untested)
* remote::fireworks (tested)
* remote::fireworks-openai-compat (untested)
* remote::gemini (untested)
* remote::groq-openai-compat (untested)
* remote::nvidia (tested)
* remote::ollama (tested)
* remote::openai (untested)
* remote::passthrough (untested)
* remote::sambanova-openai-compat (untested)
* remote::together (tested)
* remote::together-openai-compat (untested)
* remote::vllm (tested)

The goal is to support this for every inference provider, proxying directly to the provider's OpenAI endpoint for OpenAI-compatible providers. For providers that don't have an OpenAI-compatible API, we'll add a mixin to translate incoming OpenAI requests to Llama Stack inference requests and translate the Llama Stack inference responses back to OpenAI responses.

This is related to #1817 but is a bit larger in scope than just chat completions, as I have real use-cases that need the older completions API as well.

## Test Plan

### vLLM

```
VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" llama stack build --template remote-vllm --image-type venv --run

LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "meta-llama/Llama-3.2-3B-Instruct"
```

### ollama

```
INFERENCE_MODEL="llama3.2:3b-instruct-q8_0" llama stack build --template ollama --image-type venv --run

LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="llama3.2:3b-instruct-q8_0" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "llama3.2:3b-instruct-q8_0"
```

## Documentation

Run a Llama Stack distribution that uses one of the providers mentioned in the list above. Then, use your favorite OpenAI client to send completion or chat completion requests with the base_url set to http://localhost:8321/v1/openai/v1 . Replace "localhost:8321" with the host and port of your Llama Stack server, if different.

---------

Signed-off-by: Ben Browning <bbrownin@redhat.com>
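To make the Documentation section above concrete, here is a minimal sketch of driving the three new endpoints with the official `openai` Python client. The base URL, model ID, and placeholder API key are assumptions taken from the test plan; substitute the host/port of your Llama Stack server and a model registered with your distribution.

```python
# Minimal sketch: talking to Llama Stack's OpenAI-compatible endpoints with the
# official `openai` client. Assumes a local server on port 8321 with no auth in
# front of it and a registered model named "meta-llama/Llama-3.2-3B-Instruct".
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1/openai/v1",  # Llama Stack's OpenAI-compat prefix
    api_key="none",  # placeholder; assumed unused when no auth is configured
)

# /v1/openai/v1/models -- lists every model Llama Stack knows about.
for model in client.models.list():
    print(model.id)

# /v1/openai/v1/chat/completions -- OpenAI-style chat completion.
chat = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
)
print(chat.choices[0].message.content)

# /v1/openai/v1/completions -- the older text completions API.
completion = client.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    prompt="The quick brown fox",
    max_tokens=20,
)
print(completion.choices[0].text)
```

Any OpenAI-compatible client should work the same way, since only the base_url changes.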
parent 24d70cedca
commit 2b2db5fbda
27 changed files with 3265 additions and 20 deletions
932  docs/_static/llama-stack-spec.html (vendored)
@@ -3092,6 +3092,125 @@
                }
            }
        },
"/v1/openai/v1/chat/completions": {
|
||||||
|
"post": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "OK",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIChatCompletion"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Inference"
|
||||||
|
],
|
||||||
|
"description": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
|
||||||
|
"parameters": [],
|
||||||
|
"requestBody": {
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/OpenaiChatCompletionRequest"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"/v1/openai/v1/completions": {
|
||||||
|
"post": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "OK",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/OpenAICompletion"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Inference"
|
||||||
|
],
|
||||||
|
"description": "Generate an OpenAI-compatible completion for the given prompt using the specified model.",
|
||||||
|
"parameters": [],
|
||||||
|
"requestBody": {
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/OpenaiCompletionRequest"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"/v1/openai/v1/models": {
|
||||||
|
"get": {
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "OK",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIListModelsResponse"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"400": {
|
||||||
|
"$ref": "#/components/responses/BadRequest400"
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"$ref": "#/components/responses/TooManyRequests429"
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"$ref": "#/components/responses/InternalServerError500"
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"$ref": "#/components/responses/DefaultError"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
"Models"
|
||||||
|
],
|
||||||
|
"description": "",
|
||||||
|
"parameters": []
|
||||||
|
}
|
||||||
|
},
|
||||||
"/v1/post-training/preference-optimize": {
|
"/v1/post-training/preference-optimize": {
|
||||||
"post": {
|
"post": {
|
||||||
"responses": {
|
"responses": {
|
||||||
@@ -8713,6 +8832,819 @@
            ],
            "title": "LogEventRequest"
        },
"OpenAIAssistantMessageParam": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"role": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "assistant",
|
||||||
|
"default": "assistant",
|
||||||
|
"description": "Must be \"assistant\" to identify this as the model's response"
|
||||||
|
},
|
||||||
|
"content": {
|
||||||
|
"$ref": "#/components/schemas/InterleavedContent",
|
||||||
|
"description": "The content of the model's response"
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "(Optional) The name of the assistant message participant."
|
||||||
|
},
|
||||||
|
"tool_calls": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/ToolCall"
|
||||||
|
},
|
||||||
|
"description": "List of tool calls. Each tool call is a ToolCall object."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"role",
|
||||||
|
"content"
|
||||||
|
],
|
||||||
|
"title": "OpenAIAssistantMessageParam",
|
||||||
|
"description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request."
|
||||||
|
},
|
||||||
|
"OpenAIDeveloperMessageParam": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"role": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "developer",
|
||||||
|
"default": "developer",
|
||||||
|
"description": "Must be \"developer\" to identify this as a developer message"
|
||||||
|
},
|
||||||
|
"content": {
|
||||||
|
"$ref": "#/components/schemas/InterleavedContent",
|
||||||
|
"description": "The content of the developer message"
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "(Optional) The name of the developer message participant."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"role",
|
||||||
|
"content"
|
||||||
|
],
|
||||||
|
"title": "OpenAIDeveloperMessageParam",
|
||||||
|
"description": "A message from the developer in an OpenAI-compatible chat completion request."
|
||||||
|
},
|
||||||
|
"OpenAIMessageParam": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"$ref": "#/components/schemas/OpenAIUserMessageParam"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/components/schemas/OpenAISystemMessageParam"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/components/schemas/OpenAIAssistantMessageParam"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/components/schemas/OpenAIToolMessageParam"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/components/schemas/OpenAIDeveloperMessageParam"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"discriminator": {
|
||||||
|
"propertyName": "role",
|
||||||
|
"mapping": {
|
||||||
|
"user": "#/components/schemas/OpenAIUserMessageParam",
|
||||||
|
"system": "#/components/schemas/OpenAISystemMessageParam",
|
||||||
|
"assistant": "#/components/schemas/OpenAIAssistantMessageParam",
|
||||||
|
"tool": "#/components/schemas/OpenAIToolMessageParam",
|
||||||
|
"developer": "#/components/schemas/OpenAIDeveloperMessageParam"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"OpenAISystemMessageParam": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"role": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "system",
|
||||||
|
"default": "system",
|
||||||
|
"description": "Must be \"system\" to identify this as a system message"
|
||||||
|
},
|
||||||
|
"content": {
|
||||||
|
"$ref": "#/components/schemas/InterleavedContent",
|
||||||
|
"description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)."
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "(Optional) The name of the system message participant."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"role",
|
||||||
|
"content"
|
||||||
|
],
|
||||||
|
"title": "OpenAISystemMessageParam",
|
||||||
|
"description": "A system message providing instructions or context to the model."
|
||||||
|
},
|
||||||
|
"OpenAIToolMessageParam": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"role": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "tool",
|
||||||
|
"default": "tool",
|
||||||
|
"description": "Must be \"tool\" to identify this as a tool response"
|
||||||
|
},
|
||||||
|
"tool_call_id": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Unique identifier for the tool call this response is for"
|
||||||
|
},
|
||||||
|
"content": {
|
||||||
|
"$ref": "#/components/schemas/InterleavedContent",
|
||||||
|
"description": "The response content from the tool"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"role",
|
||||||
|
"tool_call_id",
|
||||||
|
"content"
|
||||||
|
],
|
||||||
|
"title": "OpenAIToolMessageParam",
|
||||||
|
"description": "A message representing the result of a tool invocation in an OpenAI-compatible chat completion request."
|
||||||
|
},
|
||||||
|
"OpenAIUserMessageParam": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"role": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "user",
|
||||||
|
"default": "user",
|
||||||
|
"description": "Must be \"user\" to identify this as a user message"
|
||||||
|
},
|
||||||
|
"content": {
|
||||||
|
"$ref": "#/components/schemas/InterleavedContent",
|
||||||
|
"description": "The content of the message, which can include text and other media"
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "(Optional) The name of the user message participant."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"role",
|
||||||
|
"content"
|
||||||
|
],
|
||||||
|
"title": "OpenAIUserMessageParam",
|
||||||
|
"description": "A message from the user in an OpenAI-compatible chat completion request."
|
||||||
|
},
|
||||||
|
"OpenaiChatCompletionRequest": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"model": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
|
||||||
|
},
|
||||||
|
"messages": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIMessageParam"
|
||||||
|
},
|
||||||
|
"description": "List of messages in the conversation"
|
||||||
|
},
|
||||||
|
"frequency_penalty": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The penalty for repeated tokens"
|
||||||
|
},
|
||||||
|
"function_call": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "null"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "(Optional) The function call to use"
|
||||||
|
},
|
||||||
|
"functions": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "null"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"description": "(Optional) List of functions to use"
|
||||||
|
},
|
||||||
|
"logit_bias": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"description": "(Optional) The logit bias to use"
|
||||||
|
},
|
||||||
|
"logprobs": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "(Optional) The log probabilities to use"
|
||||||
|
},
|
||||||
|
"max_completion_tokens": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The maximum number of tokens to generate"
|
||||||
|
},
|
||||||
|
"max_tokens": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The maximum number of tokens to generate"
|
||||||
|
},
|
||||||
|
"n": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The number of completions to generate"
|
||||||
|
},
|
||||||
|
"parallel_tool_calls": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "(Optional) Whether to parallelize tool calls"
|
||||||
|
},
|
||||||
|
"presence_penalty": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The penalty for repeated tokens"
|
||||||
|
},
|
||||||
|
"response_format": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"description": "(Optional) The response format to use"
|
||||||
|
},
|
||||||
|
"seed": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The seed to use"
|
||||||
|
},
|
||||||
|
"stop": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "(Optional) The stop tokens to use"
|
||||||
|
},
|
||||||
|
"stream": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "(Optional) Whether to stream the response"
|
||||||
|
},
|
||||||
|
"stream_options": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "null"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"description": "(Optional) The stream options to use"
|
||||||
|
},
|
||||||
|
"temperature": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The temperature to use"
|
||||||
|
},
|
||||||
|
"tool_choice": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "null"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "(Optional) The tool choice to use"
|
||||||
|
},
|
||||||
|
"tools": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "null"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"description": "(Optional) The tools to use"
|
||||||
|
},
|
||||||
|
"top_logprobs": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The top log probabilities to use"
|
||||||
|
},
|
||||||
|
"top_p": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The top p to use"
|
||||||
|
},
|
||||||
|
"user": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "(Optional) The user to use"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"model",
|
||||||
|
"messages"
|
||||||
|
],
|
||||||
|
"title": "OpenaiChatCompletionRequest"
|
||||||
|
},
|
||||||
|
"OpenAIChatCompletion": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"id": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The ID of the chat completion"
|
||||||
|
},
|
||||||
|
"choices": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIChoice"
|
||||||
|
},
|
||||||
|
"description": "List of choices"
|
||||||
|
},
|
||||||
|
"object": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "chat.completion",
|
||||||
|
"default": "chat.completion",
|
||||||
|
"description": "The object type, which will be \"chat.completion\""
|
||||||
|
},
|
||||||
|
"created": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "The Unix timestamp in seconds when the chat completion was created"
|
||||||
|
},
|
||||||
|
"model": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The model that was used to generate the chat completion"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"id",
|
||||||
|
"choices",
|
||||||
|
"object",
|
||||||
|
"created",
|
||||||
|
"model"
|
||||||
|
],
|
||||||
|
"title": "OpenAIChatCompletion",
|
||||||
|
"description": "Response from an OpenAI-compatible chat completion request."
|
||||||
|
},
|
||||||
|
"OpenAIChoice": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"message": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIMessageParam",
|
||||||
|
"description": "The message from the model"
|
||||||
|
},
|
||||||
|
"finish_reason": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The reason the model stopped generating"
|
||||||
|
},
|
||||||
|
"index": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"logprobs": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIChoiceLogprobs"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"message",
|
||||||
|
"finish_reason",
|
||||||
|
"index"
|
||||||
|
],
|
||||||
|
"title": "OpenAIChoice",
|
||||||
|
"description": "A choice from an OpenAI-compatible chat completion response."
|
||||||
|
},
|
||||||
|
"OpenAIChoiceLogprobs": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"content": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/OpenAITokenLogProb"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"refusal": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/OpenAITokenLogProb"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"title": "OpenAIChoiceLogprobs",
|
||||||
|
"description": "The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response."
|
||||||
|
},
|
||||||
|
"OpenAITokenLogProb": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"token": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"bytes": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"logprob": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"top_logprobs": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/OpenAITopLogProb"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"token",
|
||||||
|
"logprob",
|
||||||
|
"top_logprobs"
|
||||||
|
],
|
||||||
|
"title": "OpenAITokenLogProb",
|
||||||
|
"description": "The log probability for a token from an OpenAI-compatible chat completion response."
|
||||||
|
},
|
||||||
|
"OpenAITopLogProb": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"token": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"bytes": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"logprob": {
|
||||||
|
"type": "number"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"token",
|
||||||
|
"logprob"
|
||||||
|
],
|
||||||
|
"title": "OpenAITopLogProb",
|
||||||
|
"description": "The top log probability for a token from an OpenAI-compatible chat completion response."
|
||||||
|
},
|
||||||
|
"OpenaiCompletionRequest": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"model": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
|
||||||
|
},
|
||||||
|
"prompt": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "The prompt to generate a completion for"
|
||||||
|
},
|
||||||
|
"best_of": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The number of completions to generate"
|
||||||
|
},
|
||||||
|
"echo": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "(Optional) Whether to echo the prompt"
|
||||||
|
},
|
||||||
|
"frequency_penalty": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The penalty for repeated tokens"
|
||||||
|
},
|
||||||
|
"logit_bias": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
"description": "(Optional) The logit bias to use"
|
||||||
|
},
|
||||||
|
"logprobs": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "(Optional) The log probabilities to use"
|
||||||
|
},
|
||||||
|
"max_tokens": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The maximum number of tokens to generate"
|
||||||
|
},
|
||||||
|
"n": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The number of completions to generate"
|
||||||
|
},
|
||||||
|
"presence_penalty": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The penalty for repeated tokens"
|
||||||
|
},
|
||||||
|
"seed": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "(Optional) The seed to use"
|
||||||
|
},
|
||||||
|
"stop": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"description": "(Optional) The stop tokens to use"
|
||||||
|
},
|
||||||
|
"stream": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "(Optional) Whether to stream the response"
|
||||||
|
},
|
||||||
|
"stream_options": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "null"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "array"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"description": "(Optional) The stream options to use"
|
||||||
|
},
|
||||||
|
"temperature": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The temperature to use"
|
||||||
|
},
|
||||||
|
"top_p": {
|
||||||
|
"type": "number",
|
||||||
|
"description": "(Optional) The top p to use"
|
||||||
|
},
|
||||||
|
"user": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "(Optional) The user to use"
|
||||||
|
},
|
||||||
|
"guided_choice": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"prompt_logprobs": {
|
||||||
|
"type": "integer"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"model",
|
||||||
|
"prompt"
|
||||||
|
],
|
||||||
|
"title": "OpenaiCompletionRequest"
|
||||||
|
},
|
||||||
|
"OpenAICompletion": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"id": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"choices": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/OpenAICompletionChoice"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"created": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"model": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"object": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "text_completion",
|
||||||
|
"default": "text_completion"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"id",
|
||||||
|
"choices",
|
||||||
|
"created",
|
||||||
|
"model",
|
||||||
|
"object"
|
||||||
|
],
|
||||||
|
"title": "OpenAICompletion",
|
||||||
|
"description": "Response from an OpenAI-compatible completion request."
|
||||||
|
},
|
||||||
|
"OpenAICompletionChoice": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"finish_reason": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"text": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"index": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"logprobs": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIChoiceLogprobs"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"finish_reason",
|
||||||
|
"text",
|
||||||
|
"index"
|
||||||
|
],
|
||||||
|
"title": "OpenAICompletionChoice",
|
||||||
|
"description": "A choice from an OpenAI-compatible completion response."
|
||||||
|
},
|
||||||
|
"OpenAIModel": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"id": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"object": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "model",
|
||||||
|
"default": "model"
|
||||||
|
},
|
||||||
|
"created": {
|
||||||
|
"type": "integer"
|
||||||
|
},
|
||||||
|
"owned_by": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"id",
|
||||||
|
"object",
|
||||||
|
"created",
|
||||||
|
"owned_by"
|
||||||
|
],
|
||||||
|
"title": "OpenAIModel",
|
||||||
|
"description": "A model from OpenAI."
|
||||||
|
},
|
||||||
|
"OpenAIListModelsResponse": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"data": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/OpenAIModel"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"data"
|
||||||
|
],
|
||||||
|
"title": "OpenAIListModelsResponse"
|
||||||
|
},
        "DPOAlignmentConfig": {
            "type": "object",
            "properties": {
|
665  docs/_static/llama-stack-spec.yaml (vendored)
|
@@ -2131,6 +2131,91 @@ paths:
            schema:
              $ref: '#/components/schemas/LogEventRequest'
        required: true
|
/v1/openai/v1/chat/completions:
|
||||||
|
post:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: OK
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/OpenAIChatCompletion'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Inference
|
||||||
|
description: >-
|
||||||
|
Generate an OpenAI-compatible chat completion for the given messages using
|
||||||
|
the specified model.
|
||||||
|
parameters: []
|
||||||
|
requestBody:
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/OpenaiChatCompletionRequest'
|
||||||
|
required: true
|
||||||
|
/v1/openai/v1/completions:
|
||||||
|
post:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: OK
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/OpenAICompletion'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Inference
|
||||||
|
description: >-
|
||||||
|
Generate an OpenAI-compatible completion for the given prompt using the specified
|
||||||
|
model.
|
||||||
|
parameters: []
|
||||||
|
requestBody:
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/OpenaiCompletionRequest'
|
||||||
|
required: true
|
||||||
|
/v1/openai/v1/models:
|
||||||
|
get:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: OK
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/OpenAIListModelsResponse'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Models
|
||||||
|
description: ''
|
||||||
|
parameters: []
|
||||||
/v1/post-training/preference-optimize:
|
/v1/post-training/preference-optimize:
|
||||||
post:
|
post:
|
||||||
responses:
|
responses:
|
||||||
|
@ -5980,6 +6065,586 @@ components:
|
||||||
- event
|
- event
|
||||||
- ttl_seconds
|
- ttl_seconds
|
||||||
title: LogEventRequest
|
title: LogEventRequest
|
||||||
|
OpenAIAssistantMessageParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
role:
|
||||||
|
type: string
|
||||||
|
const: assistant
|
||||||
|
default: assistant
|
||||||
|
description: >-
|
||||||
|
Must be "assistant" to identify this as the model's response
|
||||||
|
content:
|
||||||
|
$ref: '#/components/schemas/InterleavedContent'
|
||||||
|
description: The content of the model's response
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
(Optional) The name of the assistant message participant.
|
||||||
|
tool_calls:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/ToolCall'
|
||||||
|
description: >-
|
||||||
|
List of tool calls. Each tool call is a ToolCall object.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- role
|
||||||
|
- content
|
||||||
|
title: OpenAIAssistantMessageParam
|
||||||
|
description: >-
|
||||||
|
A message containing the model's (assistant) response in an OpenAI-compatible
|
||||||
|
chat completion request.
|
||||||
|
OpenAIDeveloperMessageParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
role:
|
||||||
|
type: string
|
||||||
|
const: developer
|
||||||
|
default: developer
|
||||||
|
description: >-
|
||||||
|
Must be "developer" to identify this as a developer message
|
||||||
|
content:
|
||||||
|
$ref: '#/components/schemas/InterleavedContent'
|
||||||
|
description: The content of the developer message
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
(Optional) The name of the developer message participant.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- role
|
||||||
|
- content
|
||||||
|
title: OpenAIDeveloperMessageParam
|
||||||
|
description: >-
|
||||||
|
A message from the developer in an OpenAI-compatible chat completion request.
|
||||||
|
OpenAIMessageParam:
|
||||||
|
oneOf:
|
||||||
|
- $ref: '#/components/schemas/OpenAIUserMessageParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAISystemMessageParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAIAssistantMessageParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAIToolMessageParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAIDeveloperMessageParam'
|
||||||
|
discriminator:
|
||||||
|
propertyName: role
|
||||||
|
mapping:
|
||||||
|
user: '#/components/schemas/OpenAIUserMessageParam'
|
||||||
|
system: '#/components/schemas/OpenAISystemMessageParam'
|
||||||
|
assistant: '#/components/schemas/OpenAIAssistantMessageParam'
|
||||||
|
tool: '#/components/schemas/OpenAIToolMessageParam'
|
||||||
|
developer: '#/components/schemas/OpenAIDeveloperMessageParam'
|
||||||
|
OpenAISystemMessageParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
role:
|
||||||
|
type: string
|
||||||
|
const: system
|
||||||
|
default: system
|
||||||
|
description: >-
|
||||||
|
Must be "system" to identify this as a system message
|
||||||
|
content:
|
||||||
|
$ref: '#/components/schemas/InterleavedContent'
|
||||||
|
description: >-
|
||||||
|
The content of the "system prompt". If multiple system messages are provided,
|
||||||
|
they are concatenated. The underlying Llama Stack code may also add other
|
||||||
|
system messages (for example, for formatting tool definitions).
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
(Optional) The name of the system message participant.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- role
|
||||||
|
- content
|
||||||
|
title: OpenAISystemMessageParam
|
||||||
|
description: >-
|
||||||
|
A system message providing instructions or context to the model.
|
||||||
|
OpenAIToolMessageParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
role:
|
||||||
|
type: string
|
||||||
|
const: tool
|
||||||
|
default: tool
|
||||||
|
description: >-
|
||||||
|
Must be "tool" to identify this as a tool response
|
||||||
|
tool_call_id:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
Unique identifier for the tool call this response is for
|
||||||
|
content:
|
||||||
|
$ref: '#/components/schemas/InterleavedContent'
|
||||||
|
description: The response content from the tool
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- role
|
||||||
|
- tool_call_id
|
||||||
|
- content
|
||||||
|
title: OpenAIToolMessageParam
|
||||||
|
description: >-
|
||||||
|
A message representing the result of a tool invocation in an OpenAI-compatible
|
||||||
|
chat completion request.
|
||||||
|
OpenAIUserMessageParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
role:
|
||||||
|
type: string
|
||||||
|
const: user
|
||||||
|
default: user
|
||||||
|
description: >-
|
||||||
|
Must be "user" to identify this as a user message
|
||||||
|
content:
|
||||||
|
$ref: '#/components/schemas/InterleavedContent'
|
||||||
|
description: >-
|
||||||
|
The content of the message, which can include text and other media
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
(Optional) The name of the user message participant.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- role
|
||||||
|
- content
|
||||||
|
title: OpenAIUserMessageParam
|
||||||
|
description: >-
|
||||||
|
A message from the user in an OpenAI-compatible chat completion request.
|
||||||
|
OpenaiChatCompletionRequest:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
model:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
The identifier of the model to use. The model must be registered with
|
||||||
|
Llama Stack and available via the /models endpoint.
|
||||||
|
messages:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIMessageParam'
|
||||||
|
description: List of messages in the conversation
|
||||||
|
frequency_penalty:
|
||||||
|
type: number
|
||||||
|
description: >-
|
||||||
|
(Optional) The penalty for repeated tokens
|
||||||
|
function_call:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- type: object
|
||||||
|
additionalProperties:
|
||||||
|
oneOf:
|
||||||
|
- type: 'null'
|
||||||
|
- type: boolean
|
||||||
|
- type: number
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
- type: object
|
||||||
|
description: (Optional) The function call to use
|
||||||
|
functions:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
oneOf:
|
||||||
|
- type: 'null'
|
||||||
|
- type: boolean
|
||||||
|
- type: number
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
- type: object
|
||||||
|
description: (Optional) List of functions to use
|
||||||
|
logit_bias:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
type: number
|
||||||
|
description: (Optional) The logit bias to use
|
||||||
|
logprobs:
|
||||||
|
type: boolean
|
||||||
|
description: (Optional) The log probabilities to use
|
||||||
|
max_completion_tokens:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) The maximum number of tokens to generate
|
||||||
|
max_tokens:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) The maximum number of tokens to generate
|
||||||
|
n:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) The number of completions to generate
|
||||||
|
parallel_tool_calls:
|
||||||
|
type: boolean
|
||||||
|
description: >-
|
||||||
|
(Optional) Whether to parallelize tool calls
|
||||||
|
presence_penalty:
|
||||||
|
type: number
|
||||||
|
description: >-
|
||||||
|
(Optional) The penalty for repeated tokens
|
||||||
|
response_format:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
type: string
|
||||||
|
description: (Optional) The response format to use
|
||||||
|
seed:
|
||||||
|
type: integer
|
||||||
|
description: (Optional) The seed to use
|
||||||
|
stop:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
description: (Optional) The stop tokens to use
|
||||||
|
stream:
|
||||||
|
type: boolean
|
||||||
|
description: >-
|
||||||
|
(Optional) Whether to stream the response
|
||||||
|
stream_options:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
oneOf:
|
||||||
|
- type: 'null'
|
||||||
|
- type: boolean
|
||||||
|
- type: number
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
- type: object
|
||||||
|
description: (Optional) The stream options to use
|
||||||
|
temperature:
|
||||||
|
type: number
|
||||||
|
description: (Optional) The temperature to use
|
||||||
|
tool_choice:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- type: object
|
||||||
|
additionalProperties:
|
||||||
|
oneOf:
|
||||||
|
- type: 'null'
|
||||||
|
- type: boolean
|
||||||
|
- type: number
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
- type: object
|
||||||
|
description: (Optional) The tool choice to use
|
||||||
|
tools:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
oneOf:
|
||||||
|
- type: 'null'
|
||||||
|
- type: boolean
|
||||||
|
- type: number
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
- type: object
|
||||||
|
description: (Optional) The tools to use
|
||||||
|
top_logprobs:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) The top log probabilities to use
|
||||||
|
top_p:
|
||||||
|
type: number
|
||||||
|
description: (Optional) The top p to use
|
||||||
|
user:
|
||||||
|
type: string
|
||||||
|
description: (Optional) The user to use
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- model
|
||||||
|
- messages
|
||||||
|
title: OpenaiChatCompletionRequest
|
||||||
|
OpenAIChatCompletion:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
id:
|
||||||
|
type: string
|
||||||
|
description: The ID of the chat completion
|
||||||
|
choices:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIChoice'
|
||||||
|
description: List of choices
|
||||||
|
object:
|
||||||
|
type: string
|
||||||
|
const: chat.completion
|
||||||
|
default: chat.completion
|
||||||
|
description: >-
|
||||||
|
The object type, which will be "chat.completion"
|
||||||
|
created:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
The Unix timestamp in seconds when the chat completion was created
|
||||||
|
model:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
The model that was used to generate the chat completion
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- id
|
||||||
|
- choices
|
||||||
|
- object
|
||||||
|
- created
|
||||||
|
- model
|
||||||
|
title: OpenAIChatCompletion
|
||||||
|
description: >-
|
||||||
|
Response from an OpenAI-compatible chat completion request.
|
||||||
|
OpenAIChoice:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
message:
|
||||||
|
$ref: '#/components/schemas/OpenAIMessageParam'
|
||||||
|
description: The message from the model
|
||||||
|
finish_reason:
|
||||||
|
type: string
|
||||||
|
description: The reason the model stopped generating
|
||||||
|
index:
|
||||||
|
type: integer
|
||||||
|
logprobs:
|
||||||
|
$ref: '#/components/schemas/OpenAIChoiceLogprobs'
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- message
|
||||||
|
- finish_reason
|
||||||
|
- index
|
||||||
|
title: OpenAIChoice
|
||||||
|
description: >-
|
||||||
|
A choice from an OpenAI-compatible chat completion response.
|
||||||
|
OpenAIChoiceLogprobs:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
content:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAITokenLogProb'
|
||||||
|
refusal:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAITokenLogProb'
|
||||||
|
additionalProperties: false
|
||||||
|
title: OpenAIChoiceLogprobs
|
||||||
|
description: >-
|
||||||
|
The log probabilities for the tokens in the message from an OpenAI-compatible
|
||||||
|
chat completion response.
|
||||||
|
OpenAITokenLogProb:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
token:
|
||||||
|
type: string
|
||||||
|
bytes:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: integer
|
||||||
|
logprob:
|
||||||
|
type: number
|
||||||
|
top_logprobs:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAITopLogProb'
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- token
|
||||||
|
- logprob
|
||||||
|
- top_logprobs
|
||||||
|
title: OpenAITokenLogProb
|
||||||
|
description: >-
|
||||||
|
The log probability for a token from an OpenAI-compatible chat completion
|
||||||
|
response.
|
||||||
|
OpenAITopLogProb:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
token:
|
||||||
|
type: string
|
||||||
|
bytes:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: integer
|
||||||
|
logprob:
|
||||||
|
type: number
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- token
|
||||||
|
- logprob
|
||||||
|
title: OpenAITopLogProb
|
||||||
|
description: >-
|
||||||
|
The top log probability for a token from an OpenAI-compatible chat completion
|
||||||
|
response.
|
||||||
|
OpenaiCompletionRequest:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
model:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
The identifier of the model to use. The model must be registered with
|
||||||
|
Llama Stack and available via the /models endpoint.
|
||||||
|
prompt:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
- type: array
|
||||||
|
items:
|
||||||
|
type: integer
|
||||||
|
- type: array
|
||||||
|
items:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: integer
|
||||||
|
description: The prompt to generate a completion for
|
||||||
|
best_of:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) The number of completions to generate
|
||||||
|
echo:
|
||||||
|
type: boolean
|
||||||
|
description: (Optional) Whether to echo the prompt
|
||||||
|
frequency_penalty:
|
||||||
|
type: number
|
||||||
|
description: >-
|
||||||
|
(Optional) The penalty for repeated tokens
|
||||||
|
logit_bias:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
type: number
|
||||||
|
description: (Optional) The logit bias to use
|
||||||
|
logprobs:
|
||||||
|
type: boolean
|
||||||
|
description: (Optional) The log probabilities to use
|
||||||
|
max_tokens:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) The maximum number of tokens to generate
|
||||||
|
n:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) The number of completions to generate
|
||||||
|
presence_penalty:
|
||||||
|
type: number
|
||||||
|
description: >-
|
||||||
|
(Optional) The penalty for repeated tokens
|
||||||
|
seed:
|
||||||
|
type: integer
|
||||||
|
description: (Optional) The seed to use
|
||||||
|
stop:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
description: (Optional) The stop tokens to use
|
||||||
|
stream:
|
||||||
|
type: boolean
|
||||||
|
description: >-
|
||||||
|
(Optional) Whether to stream the response
|
||||||
|
stream_options:
|
||||||
|
type: object
|
||||||
|
additionalProperties:
|
||||||
|
oneOf:
|
||||||
|
- type: 'null'
|
||||||
|
- type: boolean
|
||||||
|
- type: number
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
- type: object
|
||||||
|
description: (Optional) The stream options to use
|
||||||
|
temperature:
|
||||||
|
type: number
|
||||||
|
description: (Optional) The temperature to use
|
||||||
|
top_p:
|
||||||
|
type: number
|
||||||
|
description: (Optional) The top p to use
|
||||||
|
user:
|
||||||
|
type: string
|
||||||
|
description: (Optional) The user to use
|
||||||
|
guided_choice:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
prompt_logprobs:
|
||||||
|
type: integer
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- model
|
||||||
|
- prompt
|
||||||
|
title: OpenaiCompletionRequest
|
||||||
|
OpenAICompletion:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
id:
|
||||||
|
type: string
|
||||||
|
choices:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAICompletionChoice'
|
||||||
|
created:
|
||||||
|
type: integer
|
||||||
|
model:
|
||||||
|
type: string
|
||||||
|
object:
|
||||||
|
type: string
|
||||||
|
const: text_completion
|
||||||
|
default: text_completion
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- id
|
||||||
|
- choices
|
||||||
|
- created
|
||||||
|
- model
|
||||||
|
- object
|
||||||
|
title: OpenAICompletion
|
||||||
|
description: >-
|
||||||
|
Response from an OpenAI-compatible completion request.
|
||||||
|
OpenAICompletionChoice:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
finish_reason:
|
||||||
|
type: string
|
||||||
|
text:
|
||||||
|
type: string
|
||||||
|
index:
|
||||||
|
type: integer
|
||||||
|
logprobs:
|
||||||
|
$ref: '#/components/schemas/OpenAIChoiceLogprobs'
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- finish_reason
|
||||||
|
- text
|
||||||
|
- index
|
||||||
|
title: OpenAICompletionChoice
|
||||||
|
description: >-
|
||||||
|
A choice from an OpenAI-compatible completion response.
|
||||||
|
OpenAIModel:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
id:
|
||||||
|
type: string
|
||||||
|
object:
|
||||||
|
type: string
|
||||||
|
const: model
|
||||||
|
default: model
|
||||||
|
created:
|
||||||
|
type: integer
|
||||||
|
owned_by:
|
||||||
|
type: string
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- id
|
||||||
|
- object
|
||||||
|
- created
|
||||||
|
- owned_by
|
||||||
|
title: OpenAIModel
|
||||||
|
description: A model from OpenAI.
|
||||||
|
OpenAIListModelsResponse:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
data:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIModel'
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- data
|
||||||
|
title: OpenAIListModelsResponse
|
||||||
DPOAlignmentConfig:
|
DPOAlignmentConfig:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
|
|
@@ -442,6 +442,217 @@ class EmbeddingsResponse(BaseModel):
    embeddings: List[List[float]]


@json_schema_type
class OpenAIUserMessageParam(BaseModel):
    """A message from the user in an OpenAI-compatible chat completion request.

    :param role: Must be "user" to identify this as a user message
    :param content: The content of the message, which can include text and other media
    :param name: (Optional) The name of the user message participant.
    """

    role: Literal["user"] = "user"
    content: InterleavedContent
    name: Optional[str] = None


@json_schema_type
class OpenAISystemMessageParam(BaseModel):
    """A system message providing instructions or context to the model.

    :param role: Must be "system" to identify this as a system message
    :param content: The content of the "system prompt". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions).
    :param name: (Optional) The name of the system message participant.
    """

    role: Literal["system"] = "system"
    content: InterleavedContent
    name: Optional[str] = None


@json_schema_type
class OpenAIAssistantMessageParam(BaseModel):
    """A message containing the model's (assistant) response in an OpenAI-compatible chat completion request.

    :param role: Must be "assistant" to identify this as the model's response
    :param content: The content of the model's response
    :param name: (Optional) The name of the assistant message participant.
    :param tool_calls: List of tool calls. Each tool call is a ToolCall object.
    """

    role: Literal["assistant"] = "assistant"
    content: InterleavedContent
    name: Optional[str] = None
    tool_calls: Optional[List[ToolCall]] = Field(default_factory=list)


@json_schema_type
class OpenAIToolMessageParam(BaseModel):
    """A message representing the result of a tool invocation in an OpenAI-compatible chat completion request.

    :param role: Must be "tool" to identify this as a tool response
    :param tool_call_id: Unique identifier for the tool call this response is for
    :param content: The response content from the tool
    """

    role: Literal["tool"] = "tool"
    tool_call_id: str
    content: InterleavedContent


@json_schema_type
class OpenAIDeveloperMessageParam(BaseModel):
    """A message from the developer in an OpenAI-compatible chat completion request.

    :param role: Must be "developer" to identify this as a developer message
    :param content: The content of the developer message
    :param name: (Optional) The name of the developer message participant.
    """

    role: Literal["developer"] = "developer"
    content: InterleavedContent
    name: Optional[str] = None


OpenAIMessageParam = Annotated[
    Union[
        OpenAIUserMessageParam,
        OpenAISystemMessageParam,
        OpenAIAssistantMessageParam,
        OpenAIToolMessageParam,
        OpenAIDeveloperMessageParam,
    ],
    Field(discriminator="role"),
]
register_schema(OpenAIMessageParam, name="OpenAIMessageParam")


@json_schema_type
class OpenAITopLogProb(BaseModel):
    """The top log probability for a token from an OpenAI-compatible chat completion response.

    :token: The token
    :bytes: (Optional) The bytes for the token
    :logprob: The log probability of the token
    """

    token: str
    bytes: Optional[List[int]] = None
    logprob: float


@json_schema_type
class OpenAITokenLogProb(BaseModel):
    """The log probability for a token from an OpenAI-compatible chat completion response.

    :token: The token
    :bytes: (Optional) The bytes for the token
    :logprob: The log probability of the token
    :top_logprobs: The top log probabilities for the token
    """

    token: str
    bytes: Optional[List[int]] = None
    logprob: float
    top_logprobs: List[OpenAITopLogProb]


@json_schema_type
class OpenAIChoiceLogprobs(BaseModel):
    """The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response.

    :content: (Optional) The log probabilities for the tokens in the message
    :refusal: (Optional) The log probabilities for the tokens in the message
    """

    content: Optional[List[OpenAITokenLogProb]] = None
    refusal: Optional[List[OpenAITokenLogProb]] = None


@json_schema_type
class OpenAIChoice(BaseModel):
    """A choice from an OpenAI-compatible chat completion response.

    :param message: The message from the model
    :param finish_reason: The reason the model stopped generating
    :index: The index of the choice
    :logprobs: (Optional) The log probabilities for the tokens in the message
    """

    message: OpenAIMessageParam
    finish_reason: str
    index: int
    logprobs: Optional[OpenAIChoiceLogprobs] = None


@json_schema_type
class OpenAIChatCompletion(BaseModel):
    """Response from an OpenAI-compatible chat completion request.

    :param id: The ID of the chat completion
    :param choices: List of choices
    :param object: The object type, which will be "chat.completion"
    :param created: The Unix timestamp in seconds when the chat completion was created
    :param model: The model that was used to generate the chat completion
    """

    id: str
    choices: List[OpenAIChoice]
    object: Literal["chat.completion"] = "chat.completion"
    created: int
    model: str


@json_schema_type
class OpenAICompletionLogprobs(BaseModel):
    """The log probabilities for the tokens in the message from an OpenAI-compatible completion response.

    :text_offset: (Optional) The offset of the token in the text
    :token_logprobs: (Optional) The log probabilities for the tokens
    :tokens: (Optional) The tokens
    :top_logprobs: (Optional) The top log probabilities for the tokens
    """

    text_offset: Optional[List[int]] = None
    token_logprobs: Optional[List[float]] = None
    tokens: Optional[List[str]] = None
    top_logprobs: Optional[List[Dict[str, float]]] = None


@json_schema_type
class OpenAICompletionChoice(BaseModel):
    """A choice from an OpenAI-compatible completion response.

    :finish_reason: The reason the model stopped generating
    :text: The text of the choice
    :index: The index of the choice
    :logprobs: (Optional) The log probabilities for the tokens in the choice
|
||||||
|
"""
|
||||||
|
|
||||||
|
finish_reason: str
|
||||||
|
text: str
|
||||||
|
index: int
|
||||||
|
logprobs: Optional[OpenAIChoiceLogprobs] = None
|
||||||
|
|
||||||
|
|
||||||
|
@json_schema_type
|
||||||
|
class OpenAICompletion(BaseModel):
|
||||||
|
"""Response from an OpenAI-compatible completion request.
|
||||||
|
|
||||||
|
:id: The ID of the completion
|
||||||
|
:choices: List of choices
|
||||||
|
:created: The Unix timestamp in seconds when the completion was created
|
||||||
|
:model: The model that was used to generate the completion
|
||||||
|
:object: The object type, which will be "text_completion"
|
||||||
|
"""
|
||||||
|
|
||||||
|
id: str
|
||||||
|
choices: List[OpenAICompletionChoice]
|
||||||
|
created: int
|
||||||
|
model: str
|
||||||
|
object: Literal["text_completion"] = "text_completion"
|
||||||
|
|
||||||
|
|
||||||
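For orientation, a minimal sketch of how these request and response models compose. It assumes the module's existing imports plus `OpenAIUserMessageParam`, which is defined earlier in this file; the IDs, timestamp, and model name are placeholders.

```python
# Illustrative sketch only: builds a typed message list and a response object
# using the Pydantic models above.
messages: List[OpenAIMessageParam] = [
    OpenAISystemMessageParam(content="You are a terse assistant."),
    OpenAIUserMessageParam(content="Say hello."),  # assumed to mirror the system message fields
]

response = OpenAIChatCompletion(
    id="chatcmpl-123",
    created=1700000000,
    model="meta-llama/Llama-3.2-3B-Instruct",
    choices=[
        OpenAIChoice(
            index=0,
            finish_reason="stop",
            message=OpenAIAssistantMessageParam(content="Hello!"),
        )
    ],
)
assert response.object == "chat.completion"
```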
class ModelStore(Protocol):
    async def get_model(self, identifier: str) -> Model: ...


@@ -564,3 +775,105 @@ class Inference(Protocol):
        :returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
        """
        ...

    @webmethod(route="/openai/v1/completions", method="POST")
    async def openai_completion(
        self,
        # Standard OpenAI completion parameters
        model: str,
        prompt: Union[str, List[str], List[int], List[List[int]]],
        best_of: Optional[int] = None,
        echo: Optional[bool] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
        # vLLM-specific parameters
        guided_choice: Optional[List[str]] = None,
        prompt_logprobs: Optional[int] = None,
    ) -> OpenAICompletion:
        """Generate an OpenAI-compatible completion for the given prompt using the specified model.

        :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
        :param prompt: The prompt to generate a completion for
        :param best_of: (Optional) The number of completions to generate
        :param echo: (Optional) Whether to echo the prompt
        :param frequency_penalty: (Optional) The penalty for repeated tokens
        :param logit_bias: (Optional) The logit bias to use
        :param logprobs: (Optional) The log probabilities to use
        :param max_tokens: (Optional) The maximum number of tokens to generate
        :param n: (Optional) The number of completions to generate
        :param presence_penalty: (Optional) The penalty for repeated tokens
        :param seed: (Optional) The seed to use
        :param stop: (Optional) The stop tokens to use
        :param stream: (Optional) Whether to stream the response
        :param stream_options: (Optional) The stream options to use
        :param temperature: (Optional) The temperature to use
        :param top_p: (Optional) The top p to use
        :param user: (Optional) The user to use
        """
        ...

    @webmethod(route="/openai/v1/chat/completions", method="POST")
    async def openai_chat_completion(
        self,
        model: str,
        messages: List[OpenAIMessageParam],
        frequency_penalty: Optional[float] = None,
        function_call: Optional[Union[str, Dict[str, Any]]] = None,
        functions: Optional[List[Dict[str, Any]]] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_completion_tokens: Optional[int] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        parallel_tool_calls: Optional[bool] = None,
        presence_penalty: Optional[float] = None,
        response_format: Optional[Dict[str, str]] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
        top_logprobs: Optional[int] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> OpenAIChatCompletion:
        """Generate an OpenAI-compatible chat completion for the given messages using the specified model.

        :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
        :param messages: List of messages in the conversation
        :param frequency_penalty: (Optional) The penalty for repeated tokens
        :param function_call: (Optional) The function call to use
        :param functions: (Optional) List of functions to use
        :param logit_bias: (Optional) The logit bias to use
        :param logprobs: (Optional) The log probabilities to use
        :param max_completion_tokens: (Optional) The maximum number of tokens to generate
        :param max_tokens: (Optional) The maximum number of tokens to generate
        :param n: (Optional) The number of completions to generate
        :param parallel_tool_calls: (Optional) Whether to parallelize tool calls
        :param presence_penalty: (Optional) The penalty for repeated tokens
        :param response_format: (Optional) The response format to use
        :param seed: (Optional) The seed to use
        :param stop: (Optional) The stop tokens to use
        :param stream: (Optional) Whether to stream the response
        :param stream_options: (Optional) The stream options to use
        :param temperature: (Optional) The temperature to use
        :param tool_choice: (Optional) The tool choice to use
        :param tools: (Optional) The tools to use
        :param top_logprobs: (Optional) The top log probabilities to use
        :param top_p: (Optional) The top p to use
        :param user: (Optional) The user to use
        """
        ...
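Because these routes mirror OpenAI's wire format, a stock OpenAI client pointed at a running Llama Stack server can exercise them directly. A hedged sketch; the base URL, API key, and model ID are placeholders for your own deployment:

```python
from openai import OpenAI

# Adjust base_url/model to your running Llama Stack distribution.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

chat = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Write a haiku about routers."}],
    temperature=0.7,
)
print(chat.choices[0].message.content)
```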
@@ -56,12 +56,35 @@ class ListModelsResponse(BaseModel):
    data: List[Model]


@json_schema_type
class OpenAIModel(BaseModel):
    """A model from OpenAI.

    :id: The ID of the model
    :object: The object type, which will be "model"
    :created: The Unix timestamp in seconds when the model was created
    :owned_by: The owner of the model
    """

    id: str
    object: Literal["model"] = "model"
    created: int
    owned_by: str


class OpenAIListModelsResponse(BaseModel):
    data: List[OpenAIModel]


@runtime_checkable
@trace_protocol
class Models(Protocol):
    @webmethod(route="/models", method="GET")
    async def list_models(self) -> ListModelsResponse: ...

    @webmethod(route="/openai/v1/models", method="GET")
    async def openai_list_models(self) -> OpenAIListModelsResponse: ...

    @webmethod(route="/models/{model_id:path}", method="GET")
    async def get_model(
        self,
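A small sketch of listing models through the new endpoint with the same client; the entries returned are whatever models the stack has registered, and the host/port below are placeholders.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
for m in client.models.list():
    # Each entry maps onto OpenAIModel: id, object="model", created, owned_by.
    print(m.id, m.owned_by)
```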
@@ -35,6 +35,7 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.apis.models import Model, ModelType
from llama_stack.apis.safety import RunShieldResponse, Safety
from llama_stack.apis.scoring import (
@@ -419,6 +420,126 @@ class InferenceRouter(Inference):
            task_type=task_type,
        )

    async def openai_completion(
        self,
        model: str,
        prompt: Union[str, List[str], List[int], List[List[int]]],
        best_of: Optional[int] = None,
        echo: Optional[bool] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
        guided_choice: Optional[List[str]] = None,
        prompt_logprobs: Optional[int] = None,
    ) -> OpenAICompletion:
        logger.debug(
            f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}",
        )
        model_obj = await self.routing_table.get_model(model)
        if model_obj is None:
            raise ValueError(f"Model '{model}' not found")
        if model_obj.model_type == ModelType.embedding:
            raise ValueError(f"Model '{model}' is an embedding model and does not support completions")

        params = dict(
            model=model_obj.identifier,
            prompt=prompt,
            best_of=best_of,
            echo=echo,
            frequency_penalty=frequency_penalty,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_tokens=max_tokens,
            n=n,
            presence_penalty=presence_penalty,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            top_p=top_p,
            user=user,
            guided_choice=guided_choice,
            prompt_logprobs=prompt_logprobs,
        )

        provider = self.routing_table.get_provider_impl(model_obj.identifier)
        return await provider.openai_completion(**params)

    async def openai_chat_completion(
        self,
        model: str,
        messages: List[OpenAIMessageParam],
        frequency_penalty: Optional[float] = None,
        function_call: Optional[Union[str, Dict[str, Any]]] = None,
        functions: Optional[List[Dict[str, Any]]] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_completion_tokens: Optional[int] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        parallel_tool_calls: Optional[bool] = None,
        presence_penalty: Optional[float] = None,
        response_format: Optional[Dict[str, str]] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
        top_logprobs: Optional[int] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> OpenAIChatCompletion:
        logger.debug(
            f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}",
        )
        model_obj = await self.routing_table.get_model(model)
        if model_obj is None:
            raise ValueError(f"Model '{model}' not found")
        if model_obj.model_type == ModelType.embedding:
            raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions")

        params = dict(
            model=model_obj.identifier,
            messages=messages,
            frequency_penalty=frequency_penalty,
            function_call=function_call,
            functions=functions,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_completion_tokens=max_completion_tokens,
            max_tokens=max_tokens,
            n=n,
            parallel_tool_calls=parallel_tool_calls,
            presence_penalty=presence_penalty,
            response_format=response_format,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            tool_choice=tool_choice,
            tools=tools,
            top_logprobs=top_logprobs,
            top_p=top_p,
            user=user,
        )

        provider = self.routing_table.get_provider_impl(model_obj.identifier)
        return await provider.openai_chat_completion(**params)


class SafetyRouter(Safety):
    def __init__(
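The router only resolves the model to its provider and forwards the keyword arguments unchanged, so streaming behaves exactly as it would against OpenAI, assuming the backing provider supports streamed responses. A hedged usage sketch with placeholder host and model:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
stream = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Count to five."}],
    stream=True,
)
for chunk in stream:
    # Deltas arrive incrementally; content may be None on the final chunk.
    delta = chunk.choices[0].delta.content or ""
    print(delta, end="", flush=True)
```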
@@ -5,6 +5,7 @@
# the root directory of this source tree.

import logging
import time
import uuid
from typing import Any, Dict, List, Optional

@@ -23,7 +24,7 @@ from llama_stack.apis.datasets import (
    RowsDataSource,
    URIDataSource,
)
from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType
from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType, OpenAIListModelsResponse, OpenAIModel
from llama_stack.apis.resource import ResourceType
from llama_stack.apis.scoring_functions import (
    ListScoringFunctionsResponse,
@@ -254,6 +255,19 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
    async def list_models(self) -> ListModelsResponse:
        return ListModelsResponse(data=await self.get_all_with_type("model"))

    async def openai_list_models(self) -> OpenAIListModelsResponse:
        models = await self.get_all_with_type("model")
        openai_models = [
            OpenAIModel(
                id=model.identifier,
                object="model",
                created=int(time.time()),
                owned_by="llama_stack",
            )
            for model in models
        ]
        return OpenAIListModelsResponse(data=openai_models)

    async def get_model(self, model_id: str) -> Model:
        model = await self.get_object_by_identifier("model", model_id)
        if model is None:
@@ -54,6 +54,10 @@ from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
    build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
)
from llama_stack.providers.utils.inference.prompt_adapter import (
    augment_content_with_response_format_prompt,
    chat_completion_request_to_messages,
@@ -79,6 +83,8 @@ def llama4_builder_fn(config: MetaReferenceInferenceConfig, model_id: str, llama


class MetaReferenceInferenceImpl(
    OpenAICompletionUnsupportedMixin,
    OpenAIChatCompletionUnsupportedMixin,
    SentenceTransformerEmbeddingMixin,
    Inference,
    ModelsProtocolPrivate,
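The two `*UnsupportedMixin` classes are imported here but their implementation is not part of this diff. Given how they are mixed into providers that lack an OpenAI-compatible backend, a plausible reading is that they simply reject the new methods. A hypothetical sketch only, not the actual code in `llama_stack.providers.utils.inference.openai_compat`:

```python
class OpenAICompletionUnsupportedMixin:
    async def openai_completion(self, model: str, prompt, **kwargs):
        # Hypothetical behavior: refuse OpenAI-compatible completions outright.
        raise ValueError(f"{self.__class__.__name__} does not support OpenAI-compatible completions")


class OpenAIChatCompletionUnsupportedMixin:
    async def openai_chat_completion(self, model: str, messages, **kwargs):
        # Hypothetical behavior: refuse OpenAI-compatible chat completions outright.
        raise ValueError(f"{self.__class__.__name__} does not support OpenAI-compatible chat completions")
```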
@@ -23,6 +23,10 @@ from llama_stack.providers.datatypes import Model, ModelsProtocolPrivate
from llama_stack.providers.utils.inference.embedding_mixin import (
    SentenceTransformerEmbeddingMixin,
)
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
)

from .config import SentenceTransformersInferenceConfig

@@ -30,6 +34,8 @@ log = logging.getLogger(__name__)


class SentenceTransformersInferenceImpl(
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
    SentenceTransformerEmbeddingMixin,
    Inference,
    ModelsProtocolPrivate,
@@ -66,8 +66,10 @@ from llama_stack.providers.utils.inference.model_registry import (
    ModelsProtocolPrivate,
)
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompatCompletionChoice,
    OpenAICompatCompletionResponse,
    OpenAICompletionUnsupportedMixin,
    get_stop_reason,
    process_chat_completion_stream_response,
)
@@ -172,7 +174,12 @@ def _convert_sampling_params(
    return vllm_sampling_params


class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
class VLLMInferenceImpl(
    Inference,
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
    ModelsProtocolPrivate,
):
    """
    vLLM-based inference model adapter for Llama Stack with support for multiple models.
@@ -36,8 +36,10 @@ from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
)
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompatCompletionChoice,
    OpenAICompatCompletionResponse,
    OpenAICompletionUnsupportedMixin,
    get_sampling_strategy_options,
    process_chat_completion_response,
    process_chat_completion_stream_response,
@@ -51,7 +53,12 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
from .models import MODEL_ENTRIES


class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
class BedrockInferenceAdapter(
    ModelRegistryHelper,
    Inference,
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
):
    def __init__(self, config: BedrockConfig) -> None:
        ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
        self._config = config
@@ -34,6 +34,8 @@ from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
)
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
    get_sampling_options,
    process_chat_completion_response,
    process_chat_completion_stream_response,
@@ -49,7 +51,12 @@ from .config import CerebrasImplConfig
from .models import MODEL_ENTRIES


class CerebrasInferenceAdapter(ModelRegistryHelper, Inference):
class CerebrasInferenceAdapter(
    ModelRegistryHelper,
    Inference,
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
):
    def __init__(self, config: CerebrasImplConfig) -> None:
        ModelRegistryHelper.__init__(
            self,
@@ -34,6 +34,8 @@ from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
    get_sampling_options,
    process_chat_completion_response,
    process_chat_completion_stream_response,
@@ -56,7 +58,12 @@ model_entries = [
]


class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
class DatabricksInferenceAdapter(
    ModelRegistryHelper,
    Inference,
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
):
    def __init__(self, config: DatabricksImplConfig) -> None:
        ModelRegistryHelper.__init__(self, model_entries=model_entries)
        self.config = config
@@ -4,9 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import AsyncGenerator, List, Optional, Union
from typing import Any, AsyncGenerator, Dict, List, Optional, Union

from fireworks.client import Fireworks
from openai import AsyncOpenAI

from llama_stack.apis.common.content_types import (
    InterleavedContent,
@@ -31,6 +32,7 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.distribution.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import (
@@ -39,6 +41,7 @@ from llama_stack.providers.utils.inference.model_registry import (
from llama_stack.providers.utils.inference.openai_compat import (
    convert_message_to_openai_dict,
    get_sampling_options,
    prepare_openai_completion_params,
    process_chat_completion_response,
    process_chat_completion_stream_response,
    process_completion_response,
@@ -81,10 +84,16 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
        )
        return provider_data.fireworks_api_key

    def _get_base_url(self) -> str:
        return "https://api.fireworks.ai/inference/v1"

    def _get_client(self) -> Fireworks:
        fireworks_api_key = self._get_api_key()
        return Fireworks(api_key=fireworks_api_key)

    def _get_openai_client(self) -> AsyncOpenAI:
        return AsyncOpenAI(base_url=self._get_base_url(), api_key=self._get_api_key())

    async def completion(
        self,
        model_id: str,
@@ -268,3 +277,101 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv

        embeddings = [data.embedding for data in response.data]
        return EmbeddingsResponse(embeddings=embeddings)

    async def openai_completion(
        self,
        model: str,
        prompt: Union[str, List[str], List[int], List[List[int]]],
        best_of: Optional[int] = None,
        echo: Optional[bool] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
        guided_choice: Optional[List[str]] = None,
        prompt_logprobs: Optional[int] = None,
    ) -> OpenAICompletion:
        model_obj = await self.model_store.get_model(model)
        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id,
            prompt=prompt,
            best_of=best_of,
            echo=echo,
            frequency_penalty=frequency_penalty,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_tokens=max_tokens,
            n=n,
            presence_penalty=presence_penalty,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            top_p=top_p,
            user=user,
        )
        return await self._get_openai_client().completions.create(**params)

    async def openai_chat_completion(
        self,
        model: str,
        messages: List[OpenAIMessageParam],
        frequency_penalty: Optional[float] = None,
        function_call: Optional[Union[str, Dict[str, Any]]] = None,
        functions: Optional[List[Dict[str, Any]]] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_completion_tokens: Optional[int] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        parallel_tool_calls: Optional[bool] = None,
        presence_penalty: Optional[float] = None,
        response_format: Optional[Dict[str, str]] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
        top_logprobs: Optional[int] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> OpenAIChatCompletion:
        model_obj = await self.model_store.get_model(model)
        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id,
            messages=messages,
            frequency_penalty=frequency_penalty,
            function_call=function_call,
            functions=functions,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_completion_tokens=max_completion_tokens,
            max_tokens=max_tokens,
            n=n,
            parallel_tool_calls=parallel_tool_calls,
            presence_penalty=presence_penalty,
            response_format=response_format,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            tool_choice=tool_choice,
            tools=tools,
            top_logprobs=top_logprobs,
            top_p=top_p,
            user=user,
        )
        return await self._get_openai_client().chat.completions.create(**params)
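`prepare_openai_completion_params` is imported here but its body is not part of this diff. Judging by the inline dict filtering in the Ollama adapter later in this commit and by the fact that every call site awaits it, its job is plausibly to drop unset (`None`) parameters before handing the kwargs to the OpenAI SDK. A hedged sketch of that idea, not the actual helper:

```python
from typing import Any, Dict


async def prepare_openai_completion_params(**kwargs) -> Dict[str, Any]:
    # Assumed behavior: strip parameters the caller left as None so the
    # downstream OpenAI client only receives explicitly provided arguments.
    return {k: v for k, v in kwargs.items() if v is not None}
```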
@@ -7,7 +7,7 @@
import logging
import warnings
from functools import lru_cache
from typing import AsyncIterator, List, Optional, Union
from typing import Any, AsyncIterator, Dict, List, Optional, Union

from openai import APIConnectionError, AsyncOpenAI, BadRequestError

@@ -35,6 +35,7 @@ from llama_stack.apis.inference import (
    ToolConfig,
    ToolDefinition,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.models.llama.datatypes import ToolPromptFormat
from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
@@ -42,6 +43,7 @@ from llama_stack.providers.utils.inference.model_registry import (
from llama_stack.providers.utils.inference.openai_compat import (
    convert_openai_chat_completion_choice,
    convert_openai_chat_completion_stream,
    prepare_openai_completion_params,
)
from llama_stack.providers.utils.inference.prompt_adapter import content_has_media

@@ -263,3 +265,111 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        else:
            # we pass n=1 to get only one completion
            return convert_openai_chat_completion_choice(response.choices[0])

    async def openai_completion(
        self,
        model: str,
        prompt: Union[str, List[str], List[int], List[List[int]]],
        best_of: Optional[int] = None,
        echo: Optional[bool] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
        guided_choice: Optional[List[str]] = None,
        prompt_logprobs: Optional[int] = None,
    ) -> OpenAICompletion:
        provider_model_id = self.get_provider_model_id(model)

        params = await prepare_openai_completion_params(
            model=provider_model_id,
            prompt=prompt,
            best_of=best_of,
            echo=echo,
            frequency_penalty=frequency_penalty,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_tokens=max_tokens,
            n=n,
            presence_penalty=presence_penalty,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            top_p=top_p,
            user=user,
        )

        try:
            return await self._get_client(provider_model_id).completions.create(**params)
        except APIConnectionError as e:
            raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e

    async def openai_chat_completion(
        self,
        model: str,
        messages: List[OpenAIMessageParam],
        frequency_penalty: Optional[float] = None,
        function_call: Optional[Union[str, Dict[str, Any]]] = None,
        functions: Optional[List[Dict[str, Any]]] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_completion_tokens: Optional[int] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        parallel_tool_calls: Optional[bool] = None,
        presence_penalty: Optional[float] = None,
        response_format: Optional[Dict[str, str]] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
        top_logprobs: Optional[int] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> OpenAIChatCompletion:
        provider_model_id = self.get_provider_model_id(model)

        params = await prepare_openai_completion_params(
            model=provider_model_id,
            messages=messages,
            frequency_penalty=frequency_penalty,
            function_call=function_call,
            functions=functions,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_completion_tokens=max_completion_tokens,
            max_tokens=max_tokens,
            n=n,
            parallel_tool_calls=parallel_tool_calls,
            presence_penalty=presence_penalty,
            response_format=response_format,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            tool_choice=tool_choice,
            tools=tools,
            top_logprobs=top_logprobs,
            top_p=top_p,
            user=user,
        )

        try:
            return await self._get_client(provider_model_id).chat.completions.create(**params)
        except APIConnectionError as e:
            raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e
@@ -5,10 +5,11 @@
# the root directory of this source tree.


from typing import Any, AsyncGenerator, List, Optional, Union
from typing import Any, AsyncGenerator, Dict, List, Optional, Union

import httpx
from ollama import AsyncClient
from openai import AsyncOpenAI

from llama_stack.apis.common.content_types import (
    ImageContentItem,
@@ -38,6 +39,7 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.apis.models import Model, ModelType
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import ModelsProtocolPrivate
@@ -67,7 +69,10 @@ from .models import model_entries
logger = get_logger(name=__name__, category="inference")


class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
class OllamaInferenceAdapter(
    Inference,
    ModelsProtocolPrivate,
):
    def __init__(self, url: str) -> None:
        self.register_helper = ModelRegistryHelper(model_entries)
        self.url = url
@@ -76,6 +81,10 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
    def client(self) -> AsyncClient:
        return AsyncClient(host=self.url)

    @property
    def openai_client(self) -> AsyncOpenAI:
        return AsyncOpenAI(base_url=f"{self.url}/v1", api_key="ollama")

    async def initialize(self) -> None:
        logger.info(f"checking connectivity to Ollama at `{self.url}`...")
        try:
@@ -319,6 +328,115 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):

        return model

    async def openai_completion(
        self,
        model: str,
        prompt: Union[str, List[str], List[int], List[List[int]]],
        best_of: Optional[int] = None,
        echo: Optional[bool] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
        guided_choice: Optional[List[str]] = None,
        prompt_logprobs: Optional[int] = None,
    ) -> OpenAICompletion:
        if not isinstance(prompt, str):
            raise ValueError("Ollama does not support non-string prompts for completion")

        model_obj = await self._get_model(model)
        params = {
            k: v
            for k, v in {
                "model": model_obj.provider_resource_id,
                "prompt": prompt,
                "best_of": best_of,
                "echo": echo,
                "frequency_penalty": frequency_penalty,
                "logit_bias": logit_bias,
                "logprobs": logprobs,
                "max_tokens": max_tokens,
                "n": n,
                "presence_penalty": presence_penalty,
                "seed": seed,
                "stop": stop,
                "stream": stream,
                "stream_options": stream_options,
                "temperature": temperature,
                "top_p": top_p,
                "user": user,
            }.items()
            if v is not None
        }
        return await self.openai_client.completions.create(**params)  # type: ignore

    async def openai_chat_completion(
        self,
        model: str,
        messages: List[OpenAIMessageParam],
        frequency_penalty: Optional[float] = None,
        function_call: Optional[Union[str, Dict[str, Any]]] = None,
        functions: Optional[List[Dict[str, Any]]] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_completion_tokens: Optional[int] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        parallel_tool_calls: Optional[bool] = None,
        presence_penalty: Optional[float] = None,
        response_format: Optional[Dict[str, str]] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
        top_logprobs: Optional[int] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> OpenAIChatCompletion:
        model_obj = await self._get_model(model)
        params = {
            k: v
            for k, v in {
                "model": model_obj.provider_resource_id,
                "messages": messages,
                "frequency_penalty": frequency_penalty,
                "function_call": function_call,
                "functions": functions,
                "logit_bias": logit_bias,
                "logprobs": logprobs,
                "max_completion_tokens": max_completion_tokens,
                "max_tokens": max_tokens,
                "n": n,
                "parallel_tool_calls": parallel_tool_calls,
                "presence_penalty": presence_penalty,
                "response_format": response_format,
                "seed": seed,
                "stop": stop,
                "stream": stream,
                "stream_options": stream_options,
                "temperature": temperature,
                "tool_choice": tool_choice,
                "tools": tools,
                "top_logprobs": top_logprobs,
                "top_p": top_p,
                "user": user,
            }.items()
            if v is not None
        }
        return await self.openai_client.chat.completions.create(**params)  # type: ignore


async def convert_message_to_openai_dict_for_ollama(message: Message) -> List[dict]:
    async def _convert_content(content) -> dict:
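The older completions endpoint works the same way from the client side. A hedged sketch against an Ollama-backed stack (host and model tag are placeholders), keeping in mind that the adapter above only accepts string prompts:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
completion = client.completions.create(
    model="llama3.2:3b-instruct-q8_0",
    prompt="The capital of France is",
    max_tokens=8,
    temperature=0.0,
)
print(completion.choices[0].text)
```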
@@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, AsyncGenerator, Dict, List, Optional
from typing import Any, AsyncGenerator, Dict, List, Optional, Union

from llama_stack_client import AsyncLlamaStackClient

@@ -26,9 +26,11 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.apis.models import Model
from llama_stack.distribution.library_client import convert_pydantic_to_json_value, convert_to_pydantic
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params

from .config import PassthroughImplConfig

@@ -201,6 +203,112 @@ class PassthroughInferenceAdapter(Inference):
            task_type=task_type,
        )

    async def openai_completion(
        self,
        model: str,
        prompt: Union[str, List[str], List[int], List[List[int]]],
        best_of: Optional[int] = None,
        echo: Optional[bool] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
        guided_choice: Optional[List[str]] = None,
        prompt_logprobs: Optional[int] = None,
    ) -> OpenAICompletion:
        client = self._get_client()
        model_obj = await self.model_store.get_model(model)

        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id,
            prompt=prompt,
            best_of=best_of,
            echo=echo,
            frequency_penalty=frequency_penalty,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_tokens=max_tokens,
            n=n,
            presence_penalty=presence_penalty,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            top_p=top_p,
            user=user,
            guided_choice=guided_choice,
            prompt_logprobs=prompt_logprobs,
        )

        return await client.inference.openai_completion(**params)

    async def openai_chat_completion(
        self,
        model: str,
        messages: List[OpenAIMessageParam],
        frequency_penalty: Optional[float] = None,
        function_call: Optional[Union[str, Dict[str, Any]]] = None,
        functions: Optional[List[Dict[str, Any]]] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_completion_tokens: Optional[int] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        parallel_tool_calls: Optional[bool] = None,
        presence_penalty: Optional[float] = None,
        response_format: Optional[Dict[str, str]] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
        top_logprobs: Optional[int] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> OpenAIChatCompletion:
        client = self._get_client()
        model_obj = await self.model_store.get_model(model)

        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id,
            messages=messages,
            frequency_penalty=frequency_penalty,
            function_call=function_call,
            functions=functions,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_completion_tokens=max_completion_tokens,
            max_tokens=max_tokens,
            n=n,
            parallel_tool_calls=parallel_tool_calls,
            presence_penalty=presence_penalty,
            response_format=response_format,
            seed=seed,
            stop=stop,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            tool_choice=tool_choice,
            tools=tools,
            top_logprobs=top_logprobs,
            top_p=top_p,
            user=user,
        )

        return await client.inference.openai_chat_completion(**params)

    def cast_value_to_json_dict(self, request_params: Dict[str, Any]) -> Dict[str, Any]:
        json_params = {}
        for key, value in request_params.items():
@@ -12,6 +12,8 @@ from llama_stack.apis.inference import * # noqa: F403
# from llama_stack.providers.datatypes import ModelsProtocolPrivate
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
    get_sampling_options,
    process_chat_completion_response,
    process_chat_completion_stream_response,
@@ -38,7 +40,12 @@ RUNPOD_SUPPORTED_MODELS = {
}


class RunpodInferenceAdapter(ModelRegistryHelper, Inference):
class RunpodInferenceAdapter(
    ModelRegistryHelper,
    Inference,
    OpenAIChatCompletionUnsupportedMixin,
    OpenAICompletionUnsupportedMixin,
):
    def __init__(self, config: RunpodImplConfig) -> None:
        ModelRegistryHelper.__init__(self, stack_to_provider_models_map=RUNPOD_SUPPORTED_MODELS)
        self.config = config
@@ -42,6 +42,8 @@ from llama_stack.apis.inference import (
 )
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionUnsupportedMixin,
+    OpenAICompletionUnsupportedMixin,
     process_chat_completion_stream_response,
 )
 from llama_stack.providers.utils.inference.prompt_adapter import (
@@ -52,7 +54,12 @@ from .config import SambaNovaImplConfig
 from .models import MODEL_ENTRIES
 
 
-class SambaNovaInferenceAdapter(ModelRegistryHelper, Inference):
+class SambaNovaInferenceAdapter(
+    ModelRegistryHelper,
+    Inference,
+    OpenAIChatCompletionUnsupportedMixin,
+    OpenAICompletionUnsupportedMixin,
+):
     def __init__(self, config: SambaNovaImplConfig) -> None:
         ModelRegistryHelper.__init__(self, model_entries=MODEL_ENTRIES)
         self.config = config
@@ -40,8 +40,10 @@ from llama_stack.providers.utils.inference.model_registry import (
     build_hf_repo_model_entry,
 )
 from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionUnsupportedMixin,
     OpenAICompatCompletionChoice,
     OpenAICompatCompletionResponse,
+    OpenAICompletionUnsupportedMixin,
     get_sampling_options,
     process_chat_completion_response,
     process_chat_completion_stream_response,
@@ -69,7 +71,12 @@ def build_hf_repo_model_entries():
     ]
 
 
-class _HfAdapter(Inference, ModelsProtocolPrivate):
+class _HfAdapter(
+    Inference,
+    OpenAIChatCompletionUnsupportedMixin,
+    OpenAICompletionUnsupportedMixin,
+    ModelsProtocolPrivate,
+):
     client: AsyncInferenceClient
     max_tokens: int
     model_id: str
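The RunPod, SambaNova, and HF/TGI adapters above take the two new `Unsupported` mixins rather than real implementations, so OpenAI-style calls fail fast instead of being proxied. A small illustrative sketch of what a caller sees, using a toy class that is not part of this change:

```python
# Toy class for illustration only; the real adapters mix these bases into their
# existing inheritance as shown in the hunks above.
import asyncio

from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionUnsupportedMixin,
)


class ToyAdapter(OpenAIChatCompletionUnsupportedMixin):
    pass


async def main():
    try:
        await ToyAdapter().openai_chat_completion(model="some-model", messages=[])
    except ValueError as err:
        # Raised by the mixin: "ToyAdapter doesn't support openai chat completion"
        print(err)


asyncio.run(main())
```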
@@ -4,8 +4,9 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import AsyncGenerator, List, Optional, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Union
 
+from openai import AsyncOpenAI
 from together import AsyncTogether
 
 from llama_stack.apis.common.content_types import (
@@ -30,12 +31,14 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
+from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 from llama_stack.providers.utils.inference.openai_compat import (
     convert_message_to_openai_dict,
     get_sampling_options,
+    prepare_openai_completion_params,
     process_chat_completion_response,
     process_chat_completion_stream_response,
     process_completion_response,
@@ -60,6 +63,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
         ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
         self.config = config
         self._client = None
+        self._openai_client = None
 
     async def initialize(self) -> None:
         pass
@@ -110,6 +114,15 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
             self._client = AsyncTogether(api_key=together_api_key)
         return self._client
 
+    def _get_openai_client(self) -> AsyncOpenAI:
+        if not self._openai_client:
+            together_client = self._get_client().client
+            self._openai_client = AsyncOpenAI(
+                base_url=together_client.base_url,
+                api_key=together_client.api_key,
+            )
+        return self._openai_client
+
     async def _nonstream_completion(self, request: CompletionRequest) -> ChatCompletionResponse:
         params = await self._get_params(request)
         client = self._get_client()
@@ -243,3 +256,101 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
         )
         embeddings = [item.embedding for item in r.data]
         return EmbeddingsResponse(embeddings=embeddings)
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        model_obj = await self.model_store.get_model(model)
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+        )
+        return await self._get_openai_client().completions.create(**params)  # type: ignore
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[Dict[str, str]] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAIChatCompletion:
+        model_obj = await self.model_store.get_model(model)
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+        return await self._get_openai_client().chat.completions.create(**params)  # type: ignore
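The `_get_openai_client` helper above lazily builds a single `AsyncOpenAI` instance that reuses the Together SDK client's own base_url and api_key, so the OpenAI-path requests hit Together's OpenAI-compatible endpoint with the same credentials. A standalone sketch of that lazy-reuse pattern; the holder class and the placeholder base_url/api_key are illustrative, not part of the diff:

```python
# Illustrative only: the same "create once, reuse thereafter" shape used by
# _get_openai_client() above, with placeholder base_url/api_key values.
from typing import Optional

from openai import AsyncOpenAI


class LazyOpenAIClient:
    def __init__(self, base_url: str, api_key: str) -> None:
        self._base_url = base_url
        self._api_key = api_key
        self._openai_client: Optional[AsyncOpenAI] = None

    def get(self) -> AsyncOpenAI:
        # Build the client on first use, then hand back the cached instance.
        if not self._openai_client:
            self._openai_client = AsyncOpenAI(base_url=self._base_url, api_key=self._api_key)
        return self._openai_client


holder = LazyOpenAIClient(base_url="https://example.invalid/v1", api_key="sk-placeholder")
assert holder.get() is holder.get()
```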
@@ -5,7 +5,7 @@
 # the root directory of this source tree.
 import json
 import logging
-from typing import Any, AsyncGenerator, List, Optional, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Union
 
 import httpx
 from openai import AsyncOpenAI
@@ -45,6 +45,7 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
+from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
 from llama_stack.apis.models import Model, ModelType
 from llama_stack.models.llama.datatypes import BuiltinTool, StopReason, ToolCall
 from llama_stack.models.llama.sku_list import all_registered_models
@@ -58,6 +59,7 @@ from llama_stack.providers.utils.inference.openai_compat import (
     convert_message_to_openai_dict,
     convert_tool_call,
     get_sampling_options,
+    prepare_openai_completion_params,
     process_chat_completion_stream_response,
     process_completion_response,
     process_completion_stream_response,
@@ -418,3 +420,109 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
 
         embeddings = [data.embedding for data in response.data]
         return EmbeddingsResponse(embeddings=embeddings)
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        model_obj = await self._get_model(model)
+
+        extra_body: Dict[str, Any] = {}
+        if prompt_logprobs is not None and prompt_logprobs >= 0:
+            extra_body["prompt_logprobs"] = prompt_logprobs
+        if guided_choice:
+            extra_body["guided_choice"] = guided_choice
+
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+            extra_body=extra_body,
+        )
+        return await self.client.completions.create(**params)  # type: ignore
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[Dict[str, str]] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAIChatCompletion:
+        model_obj = await self._get_model(model)
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+        return await self.client.chat.completions.create(**params)  # type: ignore
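The vLLM-only knobs above (`guided_choice`, `prompt_logprobs`) are not standard OpenAI parameters, so the adapter folds them into `extra_body` before calling the server. From a client's point of view they travel the same way, which is what the integration test added later in this diff exercises. A hedged end-to-end sketch, with the base_url and model id as placeholders for your own vLLM-backed stack:

```python
# Hedged sketch of passing vLLM-specific parameters through the OpenAI-compatible
# route via extra_body; base_url and model are placeholder values.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
response = client.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    prompt="I am feeling really sad today.",
    stream=False,
    extra_body={"guided_choice": ["joy", "sadness"]},
)
print(response.choices[0].text)  # constrained to one of the guided choices
```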
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import AsyncGenerator, AsyncIterator, List, Optional, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
 
 import litellm
 
@@ -30,6 +30,7 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
+from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
 from llama_stack.apis.models.models import Model
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
@@ -40,6 +41,7 @@ from llama_stack.providers.utils.inference.openai_compat import (
     convert_openai_chat_completion_stream,
     convert_tooldef_to_openai_tool,
     get_sampling_options,
+    prepare_openai_completion_params,
 )
 from llama_stack.providers.utils.inference.prompt_adapter import (
     interleaved_content_as_str,
@@ -245,3 +247,103 @@ class LiteLLMOpenAIMixin(
 
         embeddings = [data["embedding"] for data in response["data"]]
         return EmbeddingsResponse(embeddings=embeddings)
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        model_obj = await self._get_model(model)
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+            guided_choice=guided_choice,
+            prompt_logprobs=prompt_logprobs,
+        )
+        return litellm.text_completion(**params)
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[Dict[str, str]] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAIChatCompletion:
+        model_obj = await self._get_model(model)
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+        return litellm.completion(**params)
@@ -5,8 +5,10 @@
 # the root directory of this source tree.
 import json
 import logging
+import time
+import uuid
 import warnings
-from typing import AsyncGenerator, Dict, Iterable, List, Optional, Union
+from typing import Any, AsyncGenerator, Dict, Iterable, List, Optional, Union
 
 from openai import AsyncStream
 from openai.types.chat import (
@@ -83,6 +85,7 @@ from llama_stack.apis.inference import (
     TopPSamplingStrategy,
     UserMessage,
 )
+from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAICompletionChoice
 from llama_stack.models.llama.datatypes import (
     BuiltinTool,
     StopReason,
@@ -843,6 +846,31 @@ def _convert_openai_logprobs(
     ]
 
 
+def _convert_openai_sampling_params(
+    max_tokens: Optional[int] = None,
+    temperature: Optional[float] = None,
+    top_p: Optional[float] = None,
+) -> SamplingParams:
+    sampling_params = SamplingParams()
+
+    if max_tokens:
+        sampling_params.max_tokens = max_tokens
+
+    # Map an explicit temperature of 0 to greedy sampling
+    if temperature == 0:
+        strategy = GreedySamplingStrategy()
+    else:
+        # OpenAI defaults to 1.0 for temperature and top_p if unset
+        if temperature is None:
+            temperature = 1.0
+        if top_p is None:
+            top_p = 1.0
+        strategy = TopPSamplingStrategy(temperature=temperature, top_p=top_p)
+
+    sampling_params.strategy = strategy
+    return sampling_params
+
+
 def convert_openai_chat_completion_choice(
     choice: OpenAIChoice,
 ) -> ChatCompletionResponse:
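The helper above gives OpenAI-style sampling a deterministic mapping onto Llama Stack's `SamplingParams`: an explicit `temperature=0` becomes greedy decoding, and anything else becomes top-p sampling with OpenAI's 1.0 defaults filled in. A hedged usage sketch, assuming the helper and the strategy classes are importable from the paths suggested by the surrounding imports:

```python
# Assumes _convert_openai_sampling_params and the strategy classes are importable
# as below; the exact module paths are an assumption based on imports shown in this diff.
from llama_stack.apis.inference import GreedySamplingStrategy, TopPSamplingStrategy
from llama_stack.providers.utils.inference.openai_compat import _convert_openai_sampling_params

params = _convert_openai_sampling_params(max_tokens=64, temperature=0)
assert isinstance(params.strategy, GreedySamplingStrategy)  # temperature == 0 -> greedy

params = _convert_openai_sampling_params(top_p=0.9)  # temperature unset defaults to 1.0
assert isinstance(params.strategy, TopPSamplingStrategy)
assert params.strategy.temperature == 1.0 and params.strategy.top_p == 0.9
```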
@@ -1049,3 +1077,106 @@ async def convert_openai_chat_completion_stream(
             stop_reason=stop_reason,
         )
     )
+
+
+async def prepare_openai_completion_params(**params):
+    completion_params = {k: v for k, v in params.items() if v is not None}
+    return completion_params
+
+
+class OpenAICompletionUnsupportedMixin:
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        if stream:
+            raise ValueError(f"{self.__class__.__name__} doesn't support streaming openai completions")
+
+        # This is a pretty hacky way to do emulate completions -
+        # basically just de-batches them...
+        prompts = [prompt] if not isinstance(prompt, list) else prompt
+
+        sampling_params = _convert_openai_sampling_params(
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+        )
+
+        choices = []
+        # "n" is the number of completions to generate per prompt
+        for _i in range(0, n):
+            # and we may have multiple prompts, if batching was used
+
+            for prompt in prompts:
+                result = self.completion(
+                    model_id=model,
+                    content=prompt,
+                    sampling_params=sampling_params,
+                )
+
+                index = len(choices)
+                text = result.content
+                finish_reason = _convert_openai_finish_reason(result.stop_reason)
+
+                choice = OpenAICompletionChoice(
+                    index=index,
+                    text=text,
+                    finish_reason=finish_reason,
+                )
+                choices.append(choice)
+
+        return OpenAICompletion(
+            id=f"cmpl-{uuid.uuid4()}",
+            choices=choices,
+            created=int(time.time()),
+            model=model,
+            object="text_completion",
+        )
+
+
+class OpenAIChatCompletionUnsupportedMixin:
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIChatCompletionMessage],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[Dict[str, str]] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> OpenAIChatCompletion:
+        raise ValueError(f"{self.__class__.__name__} doesn't support openai chat completion")
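`prepare_openai_completion_params` is the small piece every adapter above leans on: it drops `None`-valued keyword arguments so that parameters the caller never set are simply not forwarded to the provider SDK. A quick self-contained illustration of that behavior; the helper body is reproduced inline so the snippet runs on its own, and the model id is just a placeholder:

```python
import asyncio


async def prepare_openai_completion_params(**params):
    # Same filtering as the helper added above: keep only parameters the caller set.
    return {k: v for k, v in params.items() if v is not None}


async def main():
    params = await prepare_openai_completion_params(
        model="llama3.2:3b-instruct-q8_0",
        temperature=0.7,
        max_tokens=None,  # unset -> dropped
        stream=None,      # unset -> dropped
    )
    print(params)  # {'model': 'llama3.2:3b-instruct-q8_0', 'temperature': 0.7}


asyncio.run(main())
```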
@@ -28,6 +28,7 @@ dependencies = [
     "jinja2>=3.1.6",
     "jsonschema",
     "llama-stack-client>=0.2.1",
+    "openai>=1.66",
     "prompt-toolkit",
     "python-dotenv",
     "pydantic>=2",
@@ -19,6 +19,7 @@ httpx==0.28.1
 huggingface-hub==0.29.0
 idna==3.10
 jinja2==3.1.6
+jiter==0.8.2
 jsonschema==4.23.0
 jsonschema-specifications==2024.10.1
 llama-stack-client==0.2.1
@@ -27,6 +28,7 @@ markdown-it-py==3.0.0
 markupsafe==3.0.2
 mdurl==0.1.2
 numpy==2.2.3
+openai==1.71.0
 packaging==24.2
 pandas==2.2.3
 pillow==11.1.0
216 tests/integration/inference/test_openai_completion.py (new file)
@@ -0,0 +1,216 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+import pytest
+from openai import OpenAI
+
+from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
+
+from ..test_cases.test_case import TestCase
+
+
+def provider_from_model(client_with_models, model_id):
+    models = {m.identifier: m for m in client_with_models.models.list()}
+    models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
+    provider_id = models[model_id].provider_id
+    providers = {p.provider_id: p for p in client_with_models.providers.list()}
+    return providers[provider_id]
+
+
+def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id):
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI completions are not supported when testing with library client yet.")
+
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type in (
+        "inline::meta-reference",
+        "inline::sentence-transformers",
+        "inline::vllm",
+        "remote::bedrock",
+        "remote::cerebras",
+        "remote::databricks",
+        # Technically Nvidia does support OpenAI completions, but none of their hosted models
+        # support both completions and chat completions endpoint and all the Llama models are
+        # just chat completions
+        "remote::nvidia",
+        "remote::runpod",
+        "remote::sambanova",
+        "remote::tgi",
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
+
+
+def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI chat completions are not supported when testing with library client yet.")
+
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type in (
+        "inline::meta-reference",
+        "inline::sentence-transformers",
+        "inline::vllm",
+        "remote::bedrock",
+        "remote::cerebras",
+        "remote::databricks",
+        "remote::runpod",
+        "remote::sambanova",
+        "remote::tgi",
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI chat completions.")
+
+
+def skip_if_provider_isnt_vllm(client_with_models, model_id):
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type != "remote::vllm":
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support vllm extra_body parameters.")
+
+
+@pytest.fixture
+def openai_client(client_with_models):
+    base_url = f"{client_with_models.base_url}/v1/openai/v1"
+    return OpenAI(base_url=base_url, api_key="bar")
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:completion:sanity",
+    ],
+)
+def test_openai_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
+    skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
+    tc = TestCase(test_case)
+
+    # ollama needs more verbose prompting for some reason here...
+    prompt = "Respond to this question and explain your answer. " + tc["content"]
+    response = openai_client.completions.create(
+        model=text_model_id,
+        prompt=prompt,
+        stream=False,
+    )
+    assert len(response.choices) > 0
+    choice = response.choices[0]
+    assert len(choice.text) > 10
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:completion:sanity",
+    ],
+)
+def test_openai_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
+    skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
+    tc = TestCase(test_case)
+
+    # ollama needs more verbose prompting for some reason here...
+    prompt = "Respond to this question and explain your answer. " + tc["content"]
+    response = openai_client.completions.create(
+        model=text_model_id,
+        prompt=prompt,
+        stream=True,
+        max_tokens=50,
+    )
+    streamed_content = [chunk.choices[0].text for chunk in response]
+    content_str = "".join(streamed_content).lower().strip()
+    assert len(content_str) > 10
+
+
+@pytest.mark.parametrize(
+    "prompt_logprobs",
+    [
+        1,
+        0,
+    ],
+)
+def test_openai_completion_prompt_logprobs(openai_client, client_with_models, text_model_id, prompt_logprobs):
+    skip_if_provider_isnt_vllm(client_with_models, text_model_id)
+
+    prompt = "Hello, world!"
+    response = openai_client.completions.create(
+        model=text_model_id,
+        prompt=prompt,
+        stream=False,
+        extra_body={
+            "prompt_logprobs": prompt_logprobs,
+        },
+    )
+    assert len(response.choices) > 0
+    choice = response.choices[0]
+    assert len(choice.prompt_logprobs) > 0
+
+
+def test_openai_completion_guided_choice(openai_client, client_with_models, text_model_id):
+    skip_if_provider_isnt_vllm(client_with_models, text_model_id)
+
+    prompt = "I am feeling really sad today."
+    response = openai_client.completions.create(
+        model=text_model_id,
+        prompt=prompt,
+        stream=False,
+        extra_body={
+            "guided_choice": ["joy", "sadness"],
+        },
+    )
+    assert len(response.choices) > 0
+    choice = response.choices[0]
+    assert choice.text in ["joy", "sadness"]
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:chat_completion:non_streaming_01",
+        "inference:chat_completion:non_streaming_02",
+    ],
+)
+def test_openai_chat_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
+    skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
+    tc = TestCase(test_case)
+    question = tc["question"]
+    expected = tc["expected"]
+
+    response = openai_client.chat.completions.create(
+        model=text_model_id,
+        messages=[
+            {
+                "role": "user",
+                "content": question,
+            }
+        ],
+        stream=False,
+    )
+    message_content = response.choices[0].message.content.lower().strip()
+    assert len(message_content) > 0
+    assert expected.lower() in message_content
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:chat_completion:streaming_01",
+        "inference:chat_completion:streaming_02",
+    ],
+)
+def test_openai_chat_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
+    skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
+    tc = TestCase(test_case)
+    question = tc["question"]
+    expected = tc["expected"]
+
+    response = openai_client.chat.completions.create(
+        model=text_model_id,
+        messages=[{"role": "user", "content": question}],
+        stream=True,
+        timeout=120,  # Increase timeout to 2 minutes for large conversation history
+    )
+    streamed_content = []
+    for chunk in response:
+        if chunk.choices[0].delta.content:
+            streamed_content.append(chunk.choices[0].delta.content.lower().strip())
+    assert len(streamed_content) > 0
+    assert expected.lower() in "".join(streamed_content)
8 uv.lock (generated)
@@ -1384,6 +1384,7 @@ dependencies = [
     { name = "jinja2" },
     { name = "jsonschema" },
    { name = "llama-stack-client" },
+    { name = "openai" },
     { name = "pillow" },
     { name = "prompt-toolkit" },
     { name = "pydantic" },
@@ -1485,6 +1486,7 @@ requires-dist = [
     { name = "mcp", marker = "extra == 'test'" },
     { name = "myst-parser", marker = "extra == 'docs'" },
     { name = "nbval", marker = "extra == 'dev'" },
+    { name = "openai", specifier = ">=1.66" },
     { name = "openai", marker = "extra == 'test'" },
     { name = "openai", marker = "extra == 'unit'" },
     { name = "opentelemetry-exporter-otlp-proto-http", marker = "extra == 'test'" },
@@ -2016,7 +2018,7 @@ wheels = [
 
 [[package]]
 name = "openai"
-version = "1.63.2"
+version = "1.71.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
@@ -2028,9 +2030,9 @@ dependencies = [
     { name = "tqdm" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/e6/1c/11b520deb71f9ea54ced3c52cd6a5f7131215deba63ad07f23982e328141/openai-1.63.2.tar.gz", hash = "sha256:aeabeec984a7d2957b4928ceaa339e2ead19c61cfcf35ae62b7c363368d26360", size = 356902 }
+sdist = { url = "https://files.pythonhosted.org/packages/d9/19/b8f0347090a649dce55a008ec54ac6abb50553a06508cdb5e7abb2813e99/openai-1.71.0.tar.gz", hash = "sha256:52b20bb990a1780f9b0b8ccebac93416343ebd3e4e714e3eff730336833ca207", size = 409926 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/15/64/db3462b358072387b8e93e6e6a38d3c741a17b4a84171ef01d6c85c63f25/openai-1.63.2-py3-none-any.whl", hash = "sha256:1f38b27b5a40814c2b7d8759ec78110df58c4a614c25f182809ca52b080ff4d4", size = 472282 },
+    { url = "https://files.pythonhosted.org/packages/c4/f7/049e85faf6a000890e5ca0edca8e9183f8a43c9e7bba869cad871da0caba/openai-1.71.0-py3-none-any.whl", hash = "sha256:e1c643738f1fff1af52bce6ef06a7716c95d089281e7011777179614f32937aa", size = 598975 },
 ]
 
 [[package]]