feat: OpenAI-Compatible models, completions, chat/completions (#1894)

# What does this PR do?

This stubs in some OpenAI server-side compatibility with three new
endpoints:

/v1/openai/v1/models
/v1/openai/v1/completions
/v1/openai/v1/chat/completions

This gives common inference apps using OpenAI clients the ability to
talk to Llama Stack using an endpoint like
http://localhost:8321/v1/openai/v1 .

The two "v1" instances in there isn't awesome, but the thinking is that
Llama Stack's API is v1 and then our OpenAI compatibility layer is
compatible with OpenAI V1. And, some OpenAI clients implicitly assume
the URL ends with "v1", so this gives maximum compatibility.

The OpenAI models endpoint is implemented in the routing layer and simply
returns all the models Llama Stack knows about.
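
For example, that list is reachable with the standard OpenAI Python client (a minimal sketch; the api_key value is a placeholder, so supply whatever your deployment expects):

```
# Minimal sketch: list the models Llama Stack knows about through the new
# OpenAI-compatible models endpoint. The api_key value is a placeholder;
# supply whatever your deployment expects.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

for model in client.models.list():
    print(model.id, model.owned_by)
```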

The following providers should work with the new OpenAI completions and
chat/completions APIs:
* remote::anthropic (untested)
* remote::cerebras-openai-compat (untested)
* remote::fireworks (tested)
* remote::fireworks-openai-compat (untested)
* remote::gemini (untested)
* remote::groq-openai-compat (untested)
* remote::nvidia (tested)
* remote::ollama (tested)
* remote::openai (untested)
* remote::passthrough (untested)
* remote::sambanova-openai-compat (untested)
* remote::together (tested)
* remote::together-openai-compat (untested)
* remote::vllm (tested)

The goal is to support this for every inference provider, proxying directly
to the provider's OpenAI endpoint for OpenAI-compatible providers. For
providers that don't have an OpenAI-compatible API, we'll add a mixin to
translate incoming OpenAI requests to Llama Stack inference requests and
translate the Llama Stack inference responses back to OpenAI responses; a
rough sketch of that idea follows.
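
Purely as an illustration of that future translation mixin (it is not part of this PR), here is a minimal sketch. The class name and the hardcoded finish_reason are assumptions; it reuses the OpenAI* response types added in this PR and a provider's native chat_completion:

```
# Hypothetical sketch of the planned translation mixin -- NOT part of this PR.
# The class name and the hardcoded finish_reason are assumptions; the OpenAI*
# types come from the new inference API definitions in this change.
import time
import uuid
from typing import Any, List

from llama_stack.apis.inference import SystemMessage, UserMessage
from llama_stack.apis.inference.inference import (
    OpenAIAssistantMessageParam,
    OpenAIChatCompletion,
    OpenAIChoice,
    OpenAIMessageParam,
)


class OpenAIChatCompletionToLlamaStackMixin:  # hypothetical name
    async def openai_chat_completion(
        self, model: str, messages: List[OpenAIMessageParam], **kwargs: Any
    ) -> OpenAIChatCompletion:
        # Convert the OpenAI-style messages to Llama Stack messages (only user
        # and system messages in this sketch), call the provider's native
        # chat_completion, and wrap the result in the OpenAI response shape.
        converted = [
            SystemMessage(content=m.content) if m.role == "system" else UserMessage(content=m.content)
            for m in messages
        ]
        response = await self.chat_completion(model_id=model, messages=converted)
        choice = OpenAIChoice(
            message=OpenAIAssistantMessageParam(content=response.completion_message.content),
            finish_reason="stop",
            index=0,
        )
        return OpenAIChatCompletion(
            id=f"chatcmpl-{uuid.uuid4()}",
            choices=[choice],
            created=int(time.time()),
            model=model,
        )
```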

This is related to #1817 but is a bit larger in scope than just chat
completions, as I have real use-cases that need the older completions
API as well.

## Test Plan

### vLLM

```
VLLM_URL="http://localhost:8000/v1" INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" llama stack build --template remote-vllm --image-type venv --run

LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "meta-llama/Llama-3.2-3B-Instruct"
```

### ollama
```
INFERENCE_MODEL="llama3.2:3b-instruct-q8_0" llama stack build --template ollama --image-type venv --run

LLAMA_STACK_CONFIG=http://localhost:8321 INFERENCE_MODEL="llama3.2:3b-instruct-q8_0" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "llama3.2:3b-instruct-q8_0"
```



## Documentation

Run a Llama Stack distribution that uses one of the providers mentioned
in the list above. Then, use your favorite OpenAI client to send
completion or chat completion requests with the base_url set to
http://localhost:8321/v1/openai/v1 . Replace "localhost:8321" with the
host and port of your Llama Stack server, if different.
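
For example, with the OpenAI Python client (a minimal sketch; the model identifier matches the test plan above, and the api_key value is a placeholder):

```
# Minimal sketch: send chat completion and completion requests through the
# OpenAI-compatible endpoints. The model identifier comes from the test plan
# above; the api_key value is a placeholder.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

chat = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(chat.choices[0].message.content)

completion = client.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    prompt="Once upon a time",
)
print(completion.choices[0].text)
```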

---------

Signed-off-by: Ben Browning <bbrownin@redhat.com>
Commit 2b2db5fbda (parent 24d70cedca), authored by Ben Browning on 2025-04-11 16:14:17 -04:00 and committed via GitHub. 27 changed files with 3265 additions and 20 deletions.


@ -3092,6 +3092,125 @@
}
}
},
"/v1/openai/v1/chat/completions": {
"post": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/OpenAIChatCompletion"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Inference"
],
"description": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/OpenaiChatCompletionRequest"
}
}
},
"required": true
}
}
},
"/v1/openai/v1/completions": {
"post": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/OpenAICompletion"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Inference"
],
"description": "Generate an OpenAI-compatible completion for the given prompt using the specified model.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/OpenaiCompletionRequest"
}
}
},
"required": true
}
}
},
"/v1/openai/v1/models": {
"get": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/OpenAIListModelsResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Models"
],
"description": "",
"parameters": []
}
},
"/v1/post-training/preference-optimize": {
"post": {
"responses": {
@ -8713,6 +8832,819 @@
],
"title": "LogEventRequest"
},
"OpenAIAssistantMessageParam": {
"type": "object",
"properties": {
"role": {
"type": "string",
"const": "assistant",
"default": "assistant",
"description": "Must be \"assistant\" to identify this as the model's response"
},
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the model's response"
},
"name": {
"type": "string",
"description": "(Optional) The name of the assistant message participant."
},
"tool_calls": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ToolCall"
},
"description": "List of tool calls. Each tool call is a ToolCall object."
}
},
"additionalProperties": false,
"required": [
"role",
"content"
],
"title": "OpenAIAssistantMessageParam",
"description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request."
},
"OpenAIDeveloperMessageParam": {
"type": "object",
"properties": {
"role": {
"type": "string",
"const": "developer",
"default": "developer",
"description": "Must be \"developer\" to identify this as a developer message"
},
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the developer message"
},
"name": {
"type": "string",
"description": "(Optional) The name of the developer message participant."
}
},
"additionalProperties": false,
"required": [
"role",
"content"
],
"title": "OpenAIDeveloperMessageParam",
"description": "A message from the developer in an OpenAI-compatible chat completion request."
},
"OpenAIMessageParam": {
"oneOf": [
{
"$ref": "#/components/schemas/OpenAIUserMessageParam"
},
{
"$ref": "#/components/schemas/OpenAISystemMessageParam"
},
{
"$ref": "#/components/schemas/OpenAIAssistantMessageParam"
},
{
"$ref": "#/components/schemas/OpenAIToolMessageParam"
},
{
"$ref": "#/components/schemas/OpenAIDeveloperMessageParam"
}
],
"discriminator": {
"propertyName": "role",
"mapping": {
"user": "#/components/schemas/OpenAIUserMessageParam",
"system": "#/components/schemas/OpenAISystemMessageParam",
"assistant": "#/components/schemas/OpenAIAssistantMessageParam",
"tool": "#/components/schemas/OpenAIToolMessageParam",
"developer": "#/components/schemas/OpenAIDeveloperMessageParam"
}
}
},
"OpenAISystemMessageParam": {
"type": "object",
"properties": {
"role": {
"type": "string",
"const": "system",
"default": "system",
"description": "Must be \"system\" to identify this as a system message"
},
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)."
},
"name": {
"type": "string",
"description": "(Optional) The name of the system message participant."
}
},
"additionalProperties": false,
"required": [
"role",
"content"
],
"title": "OpenAISystemMessageParam",
"description": "A system message providing instructions or context to the model."
},
"OpenAIToolMessageParam": {
"type": "object",
"properties": {
"role": {
"type": "string",
"const": "tool",
"default": "tool",
"description": "Must be \"tool\" to identify this as a tool response"
},
"tool_call_id": {
"type": "string",
"description": "Unique identifier for the tool call this response is for"
},
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The response content from the tool"
}
},
"additionalProperties": false,
"required": [
"role",
"tool_call_id",
"content"
],
"title": "OpenAIToolMessageParam",
"description": "A message representing the result of a tool invocation in an OpenAI-compatible chat completion request."
},
"OpenAIUserMessageParam": {
"type": "object",
"properties": {
"role": {
"type": "string",
"const": "user",
"default": "user",
"description": "Must be \"user\" to identify this as a user message"
},
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the message, which can include text and other media"
},
"name": {
"type": "string",
"description": "(Optional) The name of the user message participant."
}
},
"additionalProperties": false,
"required": [
"role",
"content"
],
"title": "OpenAIUserMessageParam",
"description": "A message from the user in an OpenAI-compatible chat completion request."
},
"OpenaiChatCompletionRequest": {
"type": "object",
"properties": {
"model": {
"type": "string",
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"messages": {
"type": "array",
"items": {
"$ref": "#/components/schemas/OpenAIMessageParam"
},
"description": "List of messages in the conversation"
},
"frequency_penalty": {
"type": "number",
"description": "(Optional) The penalty for repeated tokens"
},
"function_call": {
"oneOf": [
{
"type": "string"
},
{
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
],
"description": "(Optional) The function call to use"
},
"functions": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
},
"description": "(Optional) List of functions to use"
},
"logit_bias": {
"type": "object",
"additionalProperties": {
"type": "number"
},
"description": "(Optional) The logit bias to use"
},
"logprobs": {
"type": "boolean",
"description": "(Optional) The log probabilities to use"
},
"max_completion_tokens": {
"type": "integer",
"description": "(Optional) The maximum number of tokens to generate"
},
"max_tokens": {
"type": "integer",
"description": "(Optional) The maximum number of tokens to generate"
},
"n": {
"type": "integer",
"description": "(Optional) The number of completions to generate"
},
"parallel_tool_calls": {
"type": "boolean",
"description": "(Optional) Whether to parallelize tool calls"
},
"presence_penalty": {
"type": "number",
"description": "(Optional) The penalty for repeated tokens"
},
"response_format": {
"type": "object",
"additionalProperties": {
"type": "string"
},
"description": "(Optional) The response format to use"
},
"seed": {
"type": "integer",
"description": "(Optional) The seed to use"
},
"stop": {
"oneOf": [
{
"type": "string"
},
{
"type": "array",
"items": {
"type": "string"
}
}
],
"description": "(Optional) The stop tokens to use"
},
"stream": {
"type": "boolean",
"description": "(Optional) Whether to stream the response"
},
"stream_options": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "(Optional) The stream options to use"
},
"temperature": {
"type": "number",
"description": "(Optional) The temperature to use"
},
"tool_choice": {
"oneOf": [
{
"type": "string"
},
{
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
],
"description": "(Optional) The tool choice to use"
},
"tools": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
},
"description": "(Optional) The tools to use"
},
"top_logprobs": {
"type": "integer",
"description": "(Optional) The top log probabilities to use"
},
"top_p": {
"type": "number",
"description": "(Optional) The top p to use"
},
"user": {
"type": "string",
"description": "(Optional) The user to use"
}
},
"additionalProperties": false,
"required": [
"model",
"messages"
],
"title": "OpenaiChatCompletionRequest"
},
"OpenAIChatCompletion": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "The ID of the chat completion"
},
"choices": {
"type": "array",
"items": {
"$ref": "#/components/schemas/OpenAIChoice"
},
"description": "List of choices"
},
"object": {
"type": "string",
"const": "chat.completion",
"default": "chat.completion",
"description": "The object type, which will be \"chat.completion\""
},
"created": {
"type": "integer",
"description": "The Unix timestamp in seconds when the chat completion was created"
},
"model": {
"type": "string",
"description": "The model that was used to generate the chat completion"
}
},
"additionalProperties": false,
"required": [
"id",
"choices",
"object",
"created",
"model"
],
"title": "OpenAIChatCompletion",
"description": "Response from an OpenAI-compatible chat completion request."
},
"OpenAIChoice": {
"type": "object",
"properties": {
"message": {
"$ref": "#/components/schemas/OpenAIMessageParam",
"description": "The message from the model"
},
"finish_reason": {
"type": "string",
"description": "The reason the model stopped generating"
},
"index": {
"type": "integer"
},
"logprobs": {
"$ref": "#/components/schemas/OpenAIChoiceLogprobs"
}
},
"additionalProperties": false,
"required": [
"message",
"finish_reason",
"index"
],
"title": "OpenAIChoice",
"description": "A choice from an OpenAI-compatible chat completion response."
},
"OpenAIChoiceLogprobs": {
"type": "object",
"properties": {
"content": {
"type": "array",
"items": {
"$ref": "#/components/schemas/OpenAITokenLogProb"
}
},
"refusal": {
"type": "array",
"items": {
"$ref": "#/components/schemas/OpenAITokenLogProb"
}
}
},
"additionalProperties": false,
"title": "OpenAIChoiceLogprobs",
"description": "The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response."
},
"OpenAITokenLogProb": {
"type": "object",
"properties": {
"token": {
"type": "string"
},
"bytes": {
"type": "array",
"items": {
"type": "integer"
}
},
"logprob": {
"type": "number"
},
"top_logprobs": {
"type": "array",
"items": {
"$ref": "#/components/schemas/OpenAITopLogProb"
}
}
},
"additionalProperties": false,
"required": [
"token",
"logprob",
"top_logprobs"
],
"title": "OpenAITokenLogProb",
"description": "The log probability for a token from an OpenAI-compatible chat completion response."
},
"OpenAITopLogProb": {
"type": "object",
"properties": {
"token": {
"type": "string"
},
"bytes": {
"type": "array",
"items": {
"type": "integer"
}
},
"logprob": {
"type": "number"
}
},
"additionalProperties": false,
"required": [
"token",
"logprob"
],
"title": "OpenAITopLogProb",
"description": "The top log probability for a token from an OpenAI-compatible chat completion response."
},
"OpenaiCompletionRequest": {
"type": "object",
"properties": {
"model": {
"type": "string",
"description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
},
"prompt": {
"oneOf": [
{
"type": "string"
},
{
"type": "array",
"items": {
"type": "string"
}
},
{
"type": "array",
"items": {
"type": "integer"
}
},
{
"type": "array",
"items": {
"type": "array",
"items": {
"type": "integer"
}
}
}
],
"description": "The prompt to generate a completion for"
},
"best_of": {
"type": "integer",
"description": "(Optional) The number of completions to generate"
},
"echo": {
"type": "boolean",
"description": "(Optional) Whether to echo the prompt"
},
"frequency_penalty": {
"type": "number",
"description": "(Optional) The penalty for repeated tokens"
},
"logit_bias": {
"type": "object",
"additionalProperties": {
"type": "number"
},
"description": "(Optional) The logit bias to use"
},
"logprobs": {
"type": "boolean",
"description": "(Optional) The log probabilities to use"
},
"max_tokens": {
"type": "integer",
"description": "(Optional) The maximum number of tokens to generate"
},
"n": {
"type": "integer",
"description": "(Optional) The number of completions to generate"
},
"presence_penalty": {
"type": "number",
"description": "(Optional) The penalty for repeated tokens"
},
"seed": {
"type": "integer",
"description": "(Optional) The seed to use"
},
"stop": {
"oneOf": [
{
"type": "string"
},
{
"type": "array",
"items": {
"type": "string"
}
}
],
"description": "(Optional) The stop tokens to use"
},
"stream": {
"type": "boolean",
"description": "(Optional) Whether to stream the response"
},
"stream_options": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "(Optional) The stream options to use"
},
"temperature": {
"type": "number",
"description": "(Optional) The temperature to use"
},
"top_p": {
"type": "number",
"description": "(Optional) The top p to use"
},
"user": {
"type": "string",
"description": "(Optional) The user to use"
},
"guided_choice": {
"type": "array",
"items": {
"type": "string"
}
},
"prompt_logprobs": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"model",
"prompt"
],
"title": "OpenaiCompletionRequest"
},
"OpenAICompletion": {
"type": "object",
"properties": {
"id": {
"type": "string"
},
"choices": {
"type": "array",
"items": {
"$ref": "#/components/schemas/OpenAICompletionChoice"
}
},
"created": {
"type": "integer"
},
"model": {
"type": "string"
},
"object": {
"type": "string",
"const": "text_completion",
"default": "text_completion"
}
},
"additionalProperties": false,
"required": [
"id",
"choices",
"created",
"model",
"object"
],
"title": "OpenAICompletion",
"description": "Response from an OpenAI-compatible completion request."
},
"OpenAICompletionChoice": {
"type": "object",
"properties": {
"finish_reason": {
"type": "string"
},
"text": {
"type": "string"
},
"index": {
"type": "integer"
},
"logprobs": {
"$ref": "#/components/schemas/OpenAIChoiceLogprobs"
}
},
"additionalProperties": false,
"required": [
"finish_reason",
"text",
"index"
],
"title": "OpenAICompletionChoice",
"description": "A choice from an OpenAI-compatible completion response."
},
"OpenAIModel": {
"type": "object",
"properties": {
"id": {
"type": "string"
},
"object": {
"type": "string",
"const": "model",
"default": "model"
},
"created": {
"type": "integer"
},
"owned_by": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"id",
"object",
"created",
"owned_by"
],
"title": "OpenAIModel",
"description": "A model from OpenAI."
},
"OpenAIListModelsResponse": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/OpenAIModel"
}
}
},
"additionalProperties": false,
"required": [
"data"
],
"title": "OpenAIListModelsResponse"
},
"DPOAlignmentConfig": {
"type": "object",
"properties": {


@ -2131,6 +2131,91 @@ paths:
schema:
$ref: '#/components/schemas/LogEventRequest'
required: true
/v1/openai/v1/chat/completions:
post:
responses:
'200':
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/OpenAIChatCompletion'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
description: >-
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/OpenaiChatCompletionRequest'
required: true
/v1/openai/v1/completions:
post:
responses:
'200':
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/OpenAICompletion'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
description: >-
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/OpenaiCompletionRequest'
required: true
/v1/openai/v1/models:
get:
responses:
'200':
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/OpenAIListModelsResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Models
description: ''
parameters: []
/v1/post-training/preference-optimize:
post:
responses:
@ -5980,6 +6065,586 @@ components:
- event
- ttl_seconds
title: LogEventRequest
OpenAIAssistantMessageParam:
type: object
properties:
role:
type: string
const: assistant
default: assistant
description: >-
Must be "assistant" to identify this as the model's response
content:
$ref: '#/components/schemas/InterleavedContent'
description: The content of the model's response
name:
type: string
description: >-
(Optional) The name of the assistant message participant.
tool_calls:
type: array
items:
$ref: '#/components/schemas/ToolCall'
description: >-
List of tool calls. Each tool call is a ToolCall object.
additionalProperties: false
required:
- role
- content
title: OpenAIAssistantMessageParam
description: >-
A message containing the model's (assistant) response in an OpenAI-compatible
chat completion request.
OpenAIDeveloperMessageParam:
type: object
properties:
role:
type: string
const: developer
default: developer
description: >-
Must be "developer" to identify this as a developer message
content:
$ref: '#/components/schemas/InterleavedContent'
description: The content of the developer message
name:
type: string
description: >-
(Optional) The name of the developer message participant.
additionalProperties: false
required:
- role
- content
title: OpenAIDeveloperMessageParam
description: >-
A message from the developer in an OpenAI-compatible chat completion request.
OpenAIMessageParam:
oneOf:
- $ref: '#/components/schemas/OpenAIUserMessageParam'
- $ref: '#/components/schemas/OpenAISystemMessageParam'
- $ref: '#/components/schemas/OpenAIAssistantMessageParam'
- $ref: '#/components/schemas/OpenAIToolMessageParam'
- $ref: '#/components/schemas/OpenAIDeveloperMessageParam'
discriminator:
propertyName: role
mapping:
user: '#/components/schemas/OpenAIUserMessageParam'
system: '#/components/schemas/OpenAISystemMessageParam'
assistant: '#/components/schemas/OpenAIAssistantMessageParam'
tool: '#/components/schemas/OpenAIToolMessageParam'
developer: '#/components/schemas/OpenAIDeveloperMessageParam'
OpenAISystemMessageParam:
type: object
properties:
role:
type: string
const: system
default: system
description: >-
Must be "system" to identify this as a system message
content:
$ref: '#/components/schemas/InterleavedContent'
description: >-
The content of the "system prompt". If multiple system messages are provided,
they are concatenated. The underlying Llama Stack code may also add other
system messages (for example, for formatting tool definitions).
name:
type: string
description: >-
(Optional) The name of the system message participant.
additionalProperties: false
required:
- role
- content
title: OpenAISystemMessageParam
description: >-
A system message providing instructions or context to the model.
OpenAIToolMessageParam:
type: object
properties:
role:
type: string
const: tool
default: tool
description: >-
Must be "tool" to identify this as a tool response
tool_call_id:
type: string
description: >-
Unique identifier for the tool call this response is for
content:
$ref: '#/components/schemas/InterleavedContent'
description: The response content from the tool
additionalProperties: false
required:
- role
- tool_call_id
- content
title: OpenAIToolMessageParam
description: >-
A message representing the result of a tool invocation in an OpenAI-compatible
chat completion request.
OpenAIUserMessageParam:
type: object
properties:
role:
type: string
const: user
default: user
description: >-
Must be "user" to identify this as a user message
content:
$ref: '#/components/schemas/InterleavedContent'
description: >-
The content of the message, which can include text and other media
name:
type: string
description: >-
(Optional) The name of the user message participant.
additionalProperties: false
required:
- role
- content
title: OpenAIUserMessageParam
description: >-
A message from the user in an OpenAI-compatible chat completion request.
OpenaiChatCompletionRequest:
type: object
properties:
model:
type: string
description: >-
The identifier of the model to use. The model must be registered with
Llama Stack and available via the /models endpoint.
messages:
type: array
items:
$ref: '#/components/schemas/OpenAIMessageParam'
description: List of messages in the conversation
frequency_penalty:
type: number
description: >-
(Optional) The penalty for repeated tokens
function_call:
oneOf:
- type: string
- type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: (Optional) The function call to use
functions:
type: array
items:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: (Optional) List of functions to use
logit_bias:
type: object
additionalProperties:
type: number
description: (Optional) The logit bias to use
logprobs:
type: boolean
description: (Optional) The log probabilities to use
max_completion_tokens:
type: integer
description: >-
(Optional) The maximum number of tokens to generate
max_tokens:
type: integer
description: >-
(Optional) The maximum number of tokens to generate
n:
type: integer
description: >-
(Optional) The number of completions to generate
parallel_tool_calls:
type: boolean
description: >-
(Optional) Whether to parallelize tool calls
presence_penalty:
type: number
description: >-
(Optional) The penalty for repeated tokens
response_format:
type: object
additionalProperties:
type: string
description: (Optional) The response format to use
seed:
type: integer
description: (Optional) The seed to use
stop:
oneOf:
- type: string
- type: array
items:
type: string
description: (Optional) The stop tokens to use
stream:
type: boolean
description: >-
(Optional) Whether to stream the response
stream_options:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: (Optional) The stream options to use
temperature:
type: number
description: (Optional) The temperature to use
tool_choice:
oneOf:
- type: string
- type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: (Optional) The tool choice to use
tools:
type: array
items:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: (Optional) The tools to use
top_logprobs:
type: integer
description: >-
(Optional) The top log probabilities to use
top_p:
type: number
description: (Optional) The top p to use
user:
type: string
description: (Optional) The user to use
additionalProperties: false
required:
- model
- messages
title: OpenaiChatCompletionRequest
OpenAIChatCompletion:
type: object
properties:
id:
type: string
description: The ID of the chat completion
choices:
type: array
items:
$ref: '#/components/schemas/OpenAIChoice'
description: List of choices
object:
type: string
const: chat.completion
default: chat.completion
description: >-
The object type, which will be "chat.completion"
created:
type: integer
description: >-
The Unix timestamp in seconds when the chat completion was created
model:
type: string
description: >-
The model that was used to generate the chat completion
additionalProperties: false
required:
- id
- choices
- object
- created
- model
title: OpenAIChatCompletion
description: >-
Response from an OpenAI-compatible chat completion request.
OpenAIChoice:
type: object
properties:
message:
$ref: '#/components/schemas/OpenAIMessageParam'
description: The message from the model
finish_reason:
type: string
description: The reason the model stopped generating
index:
type: integer
logprobs:
$ref: '#/components/schemas/OpenAIChoiceLogprobs'
additionalProperties: false
required:
- message
- finish_reason
- index
title: OpenAIChoice
description: >-
A choice from an OpenAI-compatible chat completion response.
OpenAIChoiceLogprobs:
type: object
properties:
content:
type: array
items:
$ref: '#/components/schemas/OpenAITokenLogProb'
refusal:
type: array
items:
$ref: '#/components/schemas/OpenAITokenLogProb'
additionalProperties: false
title: OpenAIChoiceLogprobs
description: >-
The log probabilities for the tokens in the message from an OpenAI-compatible
chat completion response.
OpenAITokenLogProb:
type: object
properties:
token:
type: string
bytes:
type: array
items:
type: integer
logprob:
type: number
top_logprobs:
type: array
items:
$ref: '#/components/schemas/OpenAITopLogProb'
additionalProperties: false
required:
- token
- logprob
- top_logprobs
title: OpenAITokenLogProb
description: >-
The log probability for a token from an OpenAI-compatible chat completion
response.
OpenAITopLogProb:
type: object
properties:
token:
type: string
bytes:
type: array
items:
type: integer
logprob:
type: number
additionalProperties: false
required:
- token
- logprob
title: OpenAITopLogProb
description: >-
The top log probability for a token from an OpenAI-compatible chat completion
response.
OpenaiCompletionRequest:
type: object
properties:
model:
type: string
description: >-
The identifier of the model to use. The model must be registered with
Llama Stack and available via the /models endpoint.
prompt:
oneOf:
- type: string
- type: array
items:
type: string
- type: array
items:
type: integer
- type: array
items:
type: array
items:
type: integer
description: The prompt to generate a completion for
best_of:
type: integer
description: >-
(Optional) The number of completions to generate
echo:
type: boolean
description: (Optional) Whether to echo the prompt
frequency_penalty:
type: number
description: >-
(Optional) The penalty for repeated tokens
logit_bias:
type: object
additionalProperties:
type: number
description: (Optional) The logit bias to use
logprobs:
type: boolean
description: (Optional) The log probabilities to use
max_tokens:
type: integer
description: >-
(Optional) The maximum number of tokens to generate
n:
type: integer
description: >-
(Optional) The number of completions to generate
presence_penalty:
type: number
description: >-
(Optional) The penalty for repeated tokens
seed:
type: integer
description: (Optional) The seed to use
stop:
oneOf:
- type: string
- type: array
items:
type: string
description: (Optional) The stop tokens to use
stream:
type: boolean
description: >-
(Optional) Whether to stream the response
stream_options:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: (Optional) The stream options to use
temperature:
type: number
description: (Optional) The temperature to use
top_p:
type: number
description: (Optional) The top p to use
user:
type: string
description: (Optional) The user to use
guided_choice:
type: array
items:
type: string
prompt_logprobs:
type: integer
additionalProperties: false
required:
- model
- prompt
title: OpenaiCompletionRequest
OpenAICompletion:
type: object
properties:
id:
type: string
choices:
type: array
items:
$ref: '#/components/schemas/OpenAICompletionChoice'
created:
type: integer
model:
type: string
object:
type: string
const: text_completion
default: text_completion
additionalProperties: false
required:
- id
- choices
- created
- model
- object
title: OpenAICompletion
description: >-
Response from an OpenAI-compatible completion request.
OpenAICompletionChoice:
type: object
properties:
finish_reason:
type: string
text:
type: string
index:
type: integer
logprobs:
$ref: '#/components/schemas/OpenAIChoiceLogprobs'
additionalProperties: false
required:
- finish_reason
- text
- index
title: OpenAICompletionChoice
description: >-
A choice from an OpenAI-compatible completion response.
OpenAIModel:
type: object
properties:
id:
type: string
object:
type: string
const: model
default: model
created:
type: integer
owned_by:
type: string
additionalProperties: false
required:
- id
- object
- created
- owned_by
title: OpenAIModel
description: A model from OpenAI.
OpenAIListModelsResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/OpenAIModel'
additionalProperties: false
required:
- data
title: OpenAIListModelsResponse
DPOAlignmentConfig:
type: object
properties:


@ -442,6 +442,217 @@ class EmbeddingsResponse(BaseModel):
embeddings: List[List[float]]
@json_schema_type
class OpenAIUserMessageParam(BaseModel):
"""A message from the user in an OpenAI-compatible chat completion request.
:param role: Must be "user" to identify this as a user message
:param content: The content of the message, which can include text and other media
:param name: (Optional) The name of the user message participant.
"""
role: Literal["user"] = "user"
content: InterleavedContent
name: Optional[str] = None
@json_schema_type
class OpenAISystemMessageParam(BaseModel):
"""A system message providing instructions or context to the model.
:param role: Must be "system" to identify this as a system message
:param content: The content of the "system prompt". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions).
:param name: (Optional) The name of the system message participant.
"""
role: Literal["system"] = "system"
content: InterleavedContent
name: Optional[str] = None
@json_schema_type
class OpenAIAssistantMessageParam(BaseModel):
"""A message containing the model's (assistant) response in an OpenAI-compatible chat completion request.
:param role: Must be "assistant" to identify this as the model's response
:param content: The content of the model's response
:param name: (Optional) The name of the assistant message participant.
:param tool_calls: List of tool calls. Each tool call is a ToolCall object.
"""
role: Literal["assistant"] = "assistant"
content: InterleavedContent
name: Optional[str] = None
tool_calls: Optional[List[ToolCall]] = Field(default_factory=list)
@json_schema_type
class OpenAIToolMessageParam(BaseModel):
"""A message representing the result of a tool invocation in an OpenAI-compatible chat completion request.
:param role: Must be "tool" to identify this as a tool response
:param tool_call_id: Unique identifier for the tool call this response is for
:param content: The response content from the tool
"""
role: Literal["tool"] = "tool"
tool_call_id: str
content: InterleavedContent
@json_schema_type
class OpenAIDeveloperMessageParam(BaseModel):
"""A message from the developer in an OpenAI-compatible chat completion request.
:param role: Must be "developer" to identify this as a developer message
:param content: The content of the developer message
:param name: (Optional) The name of the developer message participant.
"""
role: Literal["developer"] = "developer"
content: InterleavedContent
name: Optional[str] = None
OpenAIMessageParam = Annotated[
Union[
OpenAIUserMessageParam,
OpenAISystemMessageParam,
OpenAIAssistantMessageParam,
OpenAIToolMessageParam,
OpenAIDeveloperMessageParam,
],
Field(discriminator="role"),
]
register_schema(OpenAIMessageParam, name="OpenAIMessageParam")
@json_schema_type
class OpenAITopLogProb(BaseModel):
"""The top log probability for a token from an OpenAI-compatible chat completion response.
:token: The token
:bytes: (Optional) The bytes for the token
:logprob: The log probability of the token
"""
token: str
bytes: Optional[List[int]] = None
logprob: float
@json_schema_type
class OpenAITokenLogProb(BaseModel):
"""The log probability for a token from an OpenAI-compatible chat completion response.
:token: The token
:bytes: (Optional) The bytes for the token
:logprob: The log probability of the token
:top_logprobs: The top log probabilities for the token
"""
token: str
bytes: Optional[List[int]] = None
logprob: float
top_logprobs: List[OpenAITopLogProb]
@json_schema_type
class OpenAIChoiceLogprobs(BaseModel):
"""The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response.
:content: (Optional) The log probabilities for the tokens in the message
:refusal: (Optional) The log probabilities for the tokens in the message
"""
content: Optional[List[OpenAITokenLogProb]] = None
refusal: Optional[List[OpenAITokenLogProb]] = None
@json_schema_type
class OpenAIChoice(BaseModel):
"""A choice from an OpenAI-compatible chat completion response.
:param message: The message from the model
:param finish_reason: The reason the model stopped generating
:index: The index of the choice
:logprobs: (Optional) The log probabilities for the tokens in the message
"""
message: OpenAIMessageParam
finish_reason: str
index: int
logprobs: Optional[OpenAIChoiceLogprobs] = None
@json_schema_type
class OpenAIChatCompletion(BaseModel):
"""Response from an OpenAI-compatible chat completion request.
:param id: The ID of the chat completion
:param choices: List of choices
:param object: The object type, which will be "chat.completion"
:param created: The Unix timestamp in seconds when the chat completion was created
:param model: The model that was used to generate the chat completion
"""
id: str
choices: List[OpenAIChoice]
object: Literal["chat.completion"] = "chat.completion"
created: int
model: str
@json_schema_type
class OpenAICompletionLogprobs(BaseModel):
"""The log probabilities for the tokens in the message from an OpenAI-compatible completion response.
:text_offset: (Optional) The offset of the token in the text
:token_logprobs: (Optional) The log probabilities for the tokens
:tokens: (Optional) The tokens
:top_logprobs: (Optional) The top log probabilities for the tokens
"""
text_offset: Optional[List[int]] = None
token_logprobs: Optional[List[float]] = None
tokens: Optional[List[str]] = None
top_logprobs: Optional[List[Dict[str, float]]] = None
@json_schema_type
class OpenAICompletionChoice(BaseModel):
"""A choice from an OpenAI-compatible completion response.
:finish_reason: The reason the model stopped generating
:text: The text of the choice
:index: The index of the choice
:logprobs: (Optional) The log probabilities for the tokens in the choice
"""
finish_reason: str
text: str
index: int
logprobs: Optional[OpenAIChoiceLogprobs] = None
@json_schema_type
class OpenAICompletion(BaseModel):
"""Response from an OpenAI-compatible completion request.
:id: The ID of the completion
:choices: List of choices
:created: The Unix timestamp in seconds when the completion was created
:model: The model that was used to generate the completion
:object: The object type, which will be "text_completion"
"""
id: str
choices: List[OpenAICompletionChoice]
created: int
model: str
object: Literal["text_completion"] = "text_completion"
class ModelStore(Protocol):
async def get_model(self, identifier: str) -> Model: ...
@ -564,3 +775,105 @@ class Inference(Protocol):
:returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
"""
...
@webmethod(route="/openai/v1/completions", method="POST")
async def openai_completion(
self,
# Standard OpenAI completion parameters
model: str,
prompt: Union[str, List[str], List[int], List[List[int]]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
# vLLM-specific parameters
guided_choice: Optional[List[str]] = None,
prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
"""Generate an OpenAI-compatible completion for the given prompt using the specified model.
:param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param prompt: The prompt to generate a completion for
:param best_of: (Optional) The number of completions to generate
:param echo: (Optional) Whether to echo the prompt
:param frequency_penalty: (Optional) The penalty for repeated tokens
:param logit_bias: (Optional) The logit bias to use
:param logprobs: (Optional) The log probabilities to use
:param max_tokens: (Optional) The maximum number of tokens to generate
:param n: (Optional) The number of completions to generate
:param presence_penalty: (Optional) The penalty for repeated tokens
:param seed: (Optional) The seed to use
:param stop: (Optional) The stop tokens to use
:param stream: (Optional) Whether to stream the response
:param stream_options: (Optional) The stream options to use
:param temperature: (Optional) The temperature to use
:param top_p: (Optional) The top p to use
:param user: (Optional) The user to use
"""
...
@webmethod(route="/openai/v1/chat/completions", method="POST")
async def openai_chat_completion(
self,
model: str,
messages: List[OpenAIMessageParam],
frequency_penalty: Optional[float] = None,
function_call: Optional[Union[str, Dict[str, Any]]] = None,
functions: Optional[List[Dict[str, Any]]] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_completion_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
response_format: Optional[Dict[str, str]] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
tools: Optional[List[Dict[str, Any]]] = None,
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
) -> OpenAIChatCompletion:
"""Generate an OpenAI-compatible chat completion for the given messages using the specified model.
:param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param messages: List of messages in the conversation
:param frequency_penalty: (Optional) The penalty for repeated tokens
:param function_call: (Optional) The function call to use
:param functions: (Optional) List of functions to use
:param logit_bias: (Optional) The logit bias to use
:param logprobs: (Optional) The log probabilities to use
:param max_completion_tokens: (Optional) The maximum number of tokens to generate
:param max_tokens: (Optional) The maximum number of tokens to generate
:param n: (Optional) The number of completions to generate
:param parallel_tool_calls: (Optional) Whether to parallelize tool calls
:param presence_penalty: (Optional) The penalty for repeated tokens
:param response_format: (Optional) The response format to use
:param seed: (Optional) The seed to use
:param stop: (Optional) The stop tokens to use
:param stream: (Optional) Whether to stream the response
:param stream_options: (Optional) The stream options to use
:param temperature: (Optional) The temperature to use
:param tool_choice: (Optional) The tool choice to use
:param tools: (Optional) The tools to use
:param top_logprobs: (Optional) The top log probabilities to use
:param top_p: (Optional) The top p to use
:param user: (Optional) The user to use
"""
...


@ -56,12 +56,35 @@ class ListModelsResponse(BaseModel):
data: List[Model]
@json_schema_type
class OpenAIModel(BaseModel):
"""A model from OpenAI.
:id: The ID of the model
:object: The object type, which will be "model"
:created: The Unix timestamp in seconds when the model was created
:owned_by: The owner of the model
"""
id: str
object: Literal["model"] = "model"
created: int
owned_by: str
class OpenAIListModelsResponse(BaseModel):
data: List[OpenAIModel]
@runtime_checkable
@trace_protocol
class Models(Protocol):
@webmethod(route="/models", method="GET")
async def list_models(self) -> ListModelsResponse: ...
@webmethod(route="/openai/v1/models", method="GET")
async def openai_list_models(self) -> OpenAIListModelsResponse: ...
@webmethod(route="/models/{model_id:path}", method="GET")
async def get_model(
self,


@ -35,6 +35,7 @@ from llama_stack.apis.inference import (
ToolDefinition,
ToolPromptFormat,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.apis.models import Model, ModelType
from llama_stack.apis.safety import RunShieldResponse, Safety
from llama_stack.apis.scoring import (
@ -419,6 +420,126 @@ class InferenceRouter(Inference):
task_type=task_type,
)
async def openai_completion(
self,
model: str,
prompt: Union[str, List[str], List[int], List[List[int]]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
guided_choice: Optional[List[str]] = None,
prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
logger.debug(
f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}",
)
model_obj = await self.routing_table.get_model(model)
if model_obj is None:
raise ValueError(f"Model '{model}' not found")
if model_obj.model_type == ModelType.embedding:
raise ValueError(f"Model '{model}' is an embedding model and does not support completions")
params = dict(
model=model_obj.identifier,
prompt=prompt,
best_of=best_of,
echo=echo,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
top_p=top_p,
user=user,
guided_choice=guided_choice,
prompt_logprobs=prompt_logprobs,
)
provider = self.routing_table.get_provider_impl(model_obj.identifier)
return await provider.openai_completion(**params)
async def openai_chat_completion(
self,
model: str,
messages: List[OpenAIMessageParam],
frequency_penalty: Optional[float] = None,
function_call: Optional[Union[str, Dict[str, Any]]] = None,
functions: Optional[List[Dict[str, Any]]] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_completion_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
response_format: Optional[Dict[str, str]] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
tools: Optional[List[Dict[str, Any]]] = None,
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
) -> OpenAIChatCompletion:
logger.debug(
f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}",
)
model_obj = await self.routing_table.get_model(model)
if model_obj is None:
raise ValueError(f"Model '{model}' not found")
if model_obj.model_type == ModelType.embedding:
raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions")
params = dict(
model=model_obj.identifier,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)
provider = self.routing_table.get_provider_impl(model_obj.identifier)
return await provider.openai_chat_completion(**params)
class SafetyRouter(Safety):
def __init__(


@ -5,6 +5,7 @@
# the root directory of this source tree.
import logging
import time
import uuid
from typing import Any, Dict, List, Optional
@ -23,7 +24,7 @@ from llama_stack.apis.datasets import (
RowsDataSource,
URIDataSource,
)
from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType
from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType, OpenAIListModelsResponse, OpenAIModel
from llama_stack.apis.resource import ResourceType
from llama_stack.apis.scoring_functions import (
ListScoringFunctionsResponse,
@ -254,6 +255,19 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
async def list_models(self) -> ListModelsResponse:
return ListModelsResponse(data=await self.get_all_with_type("model"))
async def openai_list_models(self) -> OpenAIListModelsResponse:
models = await self.get_all_with_type("model")
openai_models = [
OpenAIModel(
id=model.identifier,
object="model",
created=int(time.time()),
owned_by="llama_stack",
)
for model in models
]
return OpenAIListModelsResponse(data=openai_models)
async def get_model(self, model_id: str) -> Model:
model = await self.get_object_by_identifier("model", model_id)
if model is None:


@ -54,6 +54,10 @@ from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
OpenAIChatCompletionUnsupportedMixin,
OpenAICompletionUnsupportedMixin,
)
from llama_stack.providers.utils.inference.prompt_adapter import (
augment_content_with_response_format_prompt,
chat_completion_request_to_messages,
@ -79,6 +83,8 @@ def llama4_builder_fn(config: MetaReferenceInferenceConfig, model_id: str, llama
class MetaReferenceInferenceImpl(
OpenAICompletionUnsupportedMixin,
OpenAIChatCompletionUnsupportedMixin,
SentenceTransformerEmbeddingMixin,
Inference,
ModelsProtocolPrivate,


@ -23,6 +23,10 @@ from llama_stack.providers.datatypes import Model, ModelsProtocolPrivate
from llama_stack.providers.utils.inference.embedding_mixin import (
SentenceTransformerEmbeddingMixin,
)
from llama_stack.providers.utils.inference.openai_compat import (
OpenAIChatCompletionUnsupportedMixin,
OpenAICompletionUnsupportedMixin,
)
from .config import SentenceTransformersInferenceConfig
@ -30,6 +34,8 @@ log = logging.getLogger(__name__)
class SentenceTransformersInferenceImpl(
OpenAIChatCompletionUnsupportedMixin,
OpenAICompletionUnsupportedMixin,
SentenceTransformerEmbeddingMixin,
Inference,
ModelsProtocolPrivate,


@ -66,8 +66,10 @@ from llama_stack.providers.utils.inference.model_registry import (
ModelsProtocolPrivate,
)
from llama_stack.providers.utils.inference.openai_compat import (
OpenAIChatCompletionUnsupportedMixin,
OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse,
OpenAICompletionUnsupportedMixin,
get_stop_reason,
process_chat_completion_stream_response,
)
@ -172,7 +174,12 @@ def _convert_sampling_params(
return vllm_sampling_params
class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
class VLLMInferenceImpl(
Inference,
OpenAIChatCompletionUnsupportedMixin,
OpenAICompletionUnsupportedMixin,
ModelsProtocolPrivate,
):
"""
vLLM-based inference model adapter for Llama Stack with support for multiple models.


@ -36,8 +36,10 @@ from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
)
from llama_stack.providers.utils.inference.openai_compat import (
OpenAIChatCompletionUnsupportedMixin,
OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse,
OpenAICompletionUnsupportedMixin,
get_sampling_strategy_options,
process_chat_completion_response,
process_chat_completion_stream_response,
@ -51,7 +53,12 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
from .models import MODEL_ENTRIES
class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
class BedrockInferenceAdapter(
ModelRegistryHelper,
Inference,
OpenAIChatCompletionUnsupportedMixin,
OpenAICompletionUnsupportedMixin,
):
def __init__(self, config: BedrockConfig) -> None:
ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
self._config = config


@ -34,6 +34,8 @@ from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
)
from llama_stack.providers.utils.inference.openai_compat import (
OpenAIChatCompletionUnsupportedMixin,
OpenAICompletionUnsupportedMixin,
get_sampling_options,
process_chat_completion_response,
process_chat_completion_stream_response,
@ -49,7 +51,12 @@ from .config import CerebrasImplConfig
from .models import MODEL_ENTRIES
class CerebrasInferenceAdapter(ModelRegistryHelper, Inference):
class CerebrasInferenceAdapter(
ModelRegistryHelper,
Inference,
OpenAIChatCompletionUnsupportedMixin,
OpenAICompletionUnsupportedMixin,
):
def __init__(self, config: CerebrasImplConfig) -> None:
ModelRegistryHelper.__init__(
self,


@ -34,6 +34,8 @@ from llama_stack.providers.utils.inference.model_registry import (
build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
OpenAIChatCompletionUnsupportedMixin,
OpenAICompletionUnsupportedMixin,
get_sampling_options,
process_chat_completion_response,
process_chat_completion_stream_response,
@ -56,7 +58,12 @@ model_entries = [
]
class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
class DatabricksInferenceAdapter(
ModelRegistryHelper,
Inference,
OpenAIChatCompletionUnsupportedMixin,
OpenAICompletionUnsupportedMixin,
):
def __init__(self, config: DatabricksImplConfig) -> None:
ModelRegistryHelper.__init__(self, model_entries=model_entries)
self.config = config


@ -4,9 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import AsyncGenerator, List, Optional, Union
from typing import Any, AsyncGenerator, Dict, List, Optional, Union
from fireworks.client import Fireworks
from openai import AsyncOpenAI
from llama_stack.apis.common.content_types import (
InterleavedContent,
@ -31,6 +32,7 @@ from llama_stack.apis.inference import (
ToolDefinition,
ToolPromptFormat,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.distribution.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import (
@ -39,6 +41,7 @@ from llama_stack.providers.utils.inference.model_registry import (
from llama_stack.providers.utils.inference.openai_compat import (
convert_message_to_openai_dict,
get_sampling_options,
prepare_openai_completion_params,
process_chat_completion_response,
process_chat_completion_stream_response,
process_completion_response,
@ -81,10 +84,16 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
)
return provider_data.fireworks_api_key
def _get_base_url(self) -> str:
return "https://api.fireworks.ai/inference/v1"
def _get_client(self) -> Fireworks:
fireworks_api_key = self._get_api_key()
return Fireworks(api_key=fireworks_api_key)
def _get_openai_client(self) -> AsyncOpenAI:
return AsyncOpenAI(base_url=self._get_base_url(), api_key=self._get_api_key())
async def completion(
self,
model_id: str,
@ -268,3 +277,101 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
embeddings = [data.embedding for data in response.data]
return EmbeddingsResponse(embeddings=embeddings)
async def openai_completion(
self,
model: str,
prompt: Union[str, List[str], List[int], List[List[int]]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
guided_choice: Optional[List[str]] = None,
prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
prompt=prompt,
best_of=best_of,
echo=echo,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
top_p=top_p,
user=user,
)
return await self._get_openai_client().completions.create(**params)
async def openai_chat_completion(
self,
model: str,
messages: List[OpenAIMessageParam],
frequency_penalty: Optional[float] = None,
function_call: Optional[Union[str, Dict[str, Any]]] = None,
functions: Optional[List[Dict[str, Any]]] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_completion_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
response_format: Optional[Dict[str, str]] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
tools: Optional[List[Dict[str, Any]]] = None,
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
) -> OpenAIChatCompletion:
model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)
return await self._get_openai_client().chat.completions.create(**params)

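Most of the remote adapters in this change follow the same shape the Fireworks adapter shows above: resolve the provider's model id, let `prepare_openai_completion_params` drop any unset arguments, and delegate to an `AsyncOpenAI` client bound to the provider's endpoint. A condensed, hypothetical sketch of that pattern (the `ExampleOpenAIProxyAdapter` name and constructor arguments are illustrative, not part of this change):

```python
from typing import Any

from openai import AsyncOpenAI


class ExampleOpenAIProxyAdapter:
    """Illustrative only: the proxy pattern shared by the adapters in this diff."""

    def __init__(self, base_url: str, api_key: str) -> None:
        self._client = AsyncOpenAI(base_url=base_url, api_key=api_key)

    async def openai_completion(self, model: str, prompt: str, **kwargs: Any):
        # Drop unset parameters so the provider's own defaults apply.
        params = {k: v for k, v in kwargs.items() if v is not None}
        return await self._client.completions.create(model=model, prompt=prompt, **params)
```
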
View file

@ -7,7 +7,7 @@
import logging
import warnings
from functools import lru_cache
from typing import AsyncIterator, List, Optional, Union
from typing import Any, AsyncIterator, Dict, List, Optional, Union
from openai import APIConnectionError, AsyncOpenAI, BadRequestError
@ -35,6 +35,7 @@ from llama_stack.apis.inference import (
ToolConfig,
ToolDefinition,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.models.llama.datatypes import ToolPromptFormat
from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
@ -42,6 +43,7 @@ from llama_stack.providers.utils.inference.model_registry import (
from llama_stack.providers.utils.inference.openai_compat import (
convert_openai_chat_completion_choice,
convert_openai_chat_completion_stream,
prepare_openai_completion_params,
)
from llama_stack.providers.utils.inference.prompt_adapter import content_has_media
@ -263,3 +265,111 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
else:
# we pass n=1 to get only one completion
return convert_openai_chat_completion_choice(response.choices[0])
async def openai_completion(
self,
model: str,
prompt: Union[str, List[str], List[int], List[List[int]]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
guided_choice: Optional[List[str]] = None,
prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
provider_model_id = self.get_provider_model_id(model)
params = await prepare_openai_completion_params(
model=provider_model_id,
prompt=prompt,
best_of=best_of,
echo=echo,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
top_p=top_p,
user=user,
)
try:
return await self._get_client(provider_model_id).completions.create(**params)
except APIConnectionError as e:
raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e
async def openai_chat_completion(
self,
model: str,
messages: List[OpenAIMessageParam],
frequency_penalty: Optional[float] = None,
function_call: Optional[Union[str, Dict[str, Any]]] = None,
functions: Optional[List[Dict[str, Any]]] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_completion_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
response_format: Optional[Dict[str, str]] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
tools: Optional[List[Dict[str, Any]]] = None,
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
) -> OpenAIChatCompletion:
provider_model_id = self.get_provider_model_id(model)
params = await prepare_openai_completion_params(
model=provider_model_id,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)
try:
return await self._get_client(provider_model_id).chat.completions.create(**params)
except APIConnectionError as e:
raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e

View file

@ -5,10 +5,11 @@
# the root directory of this source tree.
from typing import Any, AsyncGenerator, List, Optional, Union
from typing import Any, AsyncGenerator, Dict, List, Optional, Union
import httpx
from ollama import AsyncClient
from openai import AsyncOpenAI
from llama_stack.apis.common.content_types import (
ImageContentItem,
@ -38,6 +39,7 @@ from llama_stack.apis.inference import (
ToolDefinition,
ToolPromptFormat,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.apis.models import Model, ModelType
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import ModelsProtocolPrivate
@ -67,7 +69,10 @@ from .models import model_entries
logger = get_logger(name=__name__, category="inference")
class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
class OllamaInferenceAdapter(
Inference,
ModelsProtocolPrivate,
):
def __init__(self, url: str) -> None:
self.register_helper = ModelRegistryHelper(model_entries)
self.url = url
@ -76,6 +81,10 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
def client(self) -> AsyncClient:
return AsyncClient(host=self.url)
@property
def openai_client(self) -> AsyncOpenAI:
return AsyncOpenAI(base_url=f"{self.url}/v1", api_key="ollama")
async def initialize(self) -> None:
logger.info(f"checking connectivity to Ollama at `{self.url}`...")
try:
@ -319,6 +328,115 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
return model
async def openai_completion(
self,
model: str,
prompt: Union[str, List[str], List[int], List[List[int]]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
guided_choice: Optional[List[str]] = None,
prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
if not isinstance(prompt, str):
raise ValueError("Ollama does not support non-string prompts for completion")
model_obj = await self._get_model(model)
params = {
k: v
for k, v in {
"model": model_obj.provider_resource_id,
"prompt": prompt,
"best_of": best_of,
"echo": echo,
"frequency_penalty": frequency_penalty,
"logit_bias": logit_bias,
"logprobs": logprobs,
"max_tokens": max_tokens,
"n": n,
"presence_penalty": presence_penalty,
"seed": seed,
"stop": stop,
"stream": stream,
"stream_options": stream_options,
"temperature": temperature,
"top_p": top_p,
"user": user,
}.items()
if v is not None
}
return await self.openai_client.completions.create(**params) # type: ignore
async def openai_chat_completion(
self,
model: str,
messages: List[OpenAIMessageParam],
frequency_penalty: Optional[float] = None,
function_call: Optional[Union[str, Dict[str, Any]]] = None,
functions: Optional[List[Dict[str, Any]]] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_completion_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
response_format: Optional[Dict[str, str]] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
tools: Optional[List[Dict[str, Any]]] = None,
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
) -> OpenAIChatCompletion:
model_obj = await self._get_model(model)
params = {
k: v
for k, v in {
"model": model_obj.provider_resource_id,
"messages": messages,
"frequency_penalty": frequency_penalty,
"function_call": function_call,
"functions": functions,
"logit_bias": logit_bias,
"logprobs": logprobs,
"max_completion_tokens": max_completion_tokens,
"max_tokens": max_tokens,
"n": n,
"parallel_tool_calls": parallel_tool_calls,
"presence_penalty": presence_penalty,
"response_format": response_format,
"seed": seed,
"stop": stop,
"stream": stream,
"stream_options": stream_options,
"temperature": temperature,
"tool_choice": tool_choice,
"tools": tools,
"top_logprobs": top_logprobs,
"top_p": top_p,
"user": user,
}.items()
if v is not None
}
return await self.openai_client.chat.completions.create(**params) # type: ignore
async def convert_message_to_openai_dict_for_ollama(message: Message) -> List[dict]:
async def _convert_content(content) -> dict:

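Ollama already exposes an OpenAI-compatible API under `/v1`, so the adapter above just wraps it with an `AsyncOpenAI` client and a placeholder API key, filtering out unset parameters inline. A quick way to sanity-check that upstream endpoint directly, assuming a local Ollama on its default port with an already-pulled model (the host, port, and model tag here are assumptions, not part of the diff):

```python
# Illustrative sketch: talk to Ollama's OpenAI-compatible endpoint directly,
# the same way the adapter's `openai_client` property does.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
resp = client.completions.create(
    model="llama3.2:3b",  # placeholder: any model tag you have pulled locally
    prompt="Say hello in one short sentence.",
    max_tokens=32,
)
print(resp.choices[0].text)
```
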
View file

@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, AsyncGenerator, Dict, List, Optional
from typing import Any, AsyncGenerator, Dict, List, Optional, Union
from llama_stack_client import AsyncLlamaStackClient
@ -26,9 +26,11 @@ from llama_stack.apis.inference import (
ToolDefinition,
ToolPromptFormat,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.apis.models import Model
from llama_stack.distribution.library_client import convert_pydantic_to_json_value, convert_to_pydantic
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
from .config import PassthroughImplConfig
@ -201,6 +203,112 @@ class PassthroughInferenceAdapter(Inference):
task_type=task_type,
)
async def openai_completion(
self,
model: str,
prompt: Union[str, List[str], List[int], List[List[int]]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
guided_choice: Optional[List[str]] = None,
prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
client = self._get_client()
model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
prompt=prompt,
best_of=best_of,
echo=echo,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
top_p=top_p,
user=user,
guided_choice=guided_choice,
prompt_logprobs=prompt_logprobs,
)
return await client.inference.openai_completion(**params)
async def openai_chat_completion(
self,
model: str,
messages: List[OpenAIMessageParam],
frequency_penalty: Optional[float] = None,
function_call: Optional[Union[str, Dict[str, Any]]] = None,
functions: Optional[List[Dict[str, Any]]] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_completion_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
response_format: Optional[Dict[str, str]] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
tools: Optional[List[Dict[str, Any]]] = None,
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
) -> OpenAIChatCompletion:
client = self._get_client()
model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)
return await client.inference.openai_chat_completion(**params)
def cast_value_to_json_dict(self, request_params: Dict[str, Any]) -> Dict[str, Any]:
json_params = {}
for key, value in request_params.items():

View file

@ -12,6 +12,8 @@ from llama_stack.apis.inference import * # noqa: F403
# from llama_stack.providers.datatypes import ModelsProtocolPrivate
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import (
OpenAIChatCompletionUnsupportedMixin,
OpenAICompletionUnsupportedMixin,
get_sampling_options,
process_chat_completion_response,
process_chat_completion_stream_response,
@ -38,7 +40,12 @@ RUNPOD_SUPPORTED_MODELS = {
}
class RunpodInferenceAdapter(ModelRegistryHelper, Inference):
class RunpodInferenceAdapter(
ModelRegistryHelper,
Inference,
OpenAIChatCompletionUnsupportedMixin,
OpenAICompletionUnsupportedMixin,
):
def __init__(self, config: RunpodImplConfig) -> None:
ModelRegistryHelper.__init__(self, stack_to_provider_models_map=RUNPOD_SUPPORTED_MODELS)
self.config = config

View file

@ -42,6 +42,8 @@ from llama_stack.apis.inference import (
)
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import (
OpenAIChatCompletionUnsupportedMixin,
OpenAICompletionUnsupportedMixin,
process_chat_completion_stream_response,
)
from llama_stack.providers.utils.inference.prompt_adapter import (
@ -52,7 +54,12 @@ from .config import SambaNovaImplConfig
from .models import MODEL_ENTRIES
class SambaNovaInferenceAdapter(ModelRegistryHelper, Inference):
class SambaNovaInferenceAdapter(
ModelRegistryHelper,
Inference,
OpenAIChatCompletionUnsupportedMixin,
OpenAICompletionUnsupportedMixin,
):
def __init__(self, config: SambaNovaImplConfig) -> None:
ModelRegistryHelper.__init__(self, model_entries=MODEL_ENTRIES)
self.config = config

View file

@ -40,8 +40,10 @@ from llama_stack.providers.utils.inference.model_registry import (
build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
OpenAIChatCompletionUnsupportedMixin,
OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse,
OpenAICompletionUnsupportedMixin,
get_sampling_options,
process_chat_completion_response,
process_chat_completion_stream_response,
@ -69,7 +71,12 @@ def build_hf_repo_model_entries():
]
class _HfAdapter(Inference, ModelsProtocolPrivate):
class _HfAdapter(
Inference,
OpenAIChatCompletionUnsupportedMixin,
OpenAICompletionUnsupportedMixin,
ModelsProtocolPrivate,
):
client: AsyncInferenceClient
max_tokens: int
model_id: str

View file

@ -4,8 +4,9 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import AsyncGenerator, List, Optional, Union
from typing import Any, AsyncGenerator, Dict, List, Optional, Union
from openai import AsyncOpenAI
from together import AsyncTogether
from llama_stack.apis.common.content_types import (
@ -30,12 +31,14 @@ from llama_stack.apis.inference import (
ToolDefinition,
ToolPromptFormat,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.distribution.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import (
convert_message_to_openai_dict,
get_sampling_options,
prepare_openai_completion_params,
process_chat_completion_response,
process_chat_completion_stream_response,
process_completion_response,
@ -60,6 +63,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
self.config = config
self._client = None
self._openai_client = None
async def initialize(self) -> None:
pass
@ -110,6 +114,15 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
self._client = AsyncTogether(api_key=together_api_key)
return self._client
def _get_openai_client(self) -> AsyncOpenAI:
if not self._openai_client:
together_client = self._get_client().client
self._openai_client = AsyncOpenAI(
base_url=together_client.base_url,
api_key=together_client.api_key,
)
return self._openai_client
async def _nonstream_completion(self, request: CompletionRequest) -> ChatCompletionResponse:
params = await self._get_params(request)
client = self._get_client()
@ -243,3 +256,101 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
)
embeddings = [item.embedding for item in r.data]
return EmbeddingsResponse(embeddings=embeddings)
async def openai_completion(
self,
model: str,
prompt: Union[str, List[str], List[int], List[List[int]]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
guided_choice: Optional[List[str]] = None,
prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
prompt=prompt,
best_of=best_of,
echo=echo,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
top_p=top_p,
user=user,
)
return await self._get_openai_client().completions.create(**params) # type: ignore
async def openai_chat_completion(
self,
model: str,
messages: List[OpenAIMessageParam],
frequency_penalty: Optional[float] = None,
function_call: Optional[Union[str, Dict[str, Any]]] = None,
functions: Optional[List[Dict[str, Any]]] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_completion_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
response_format: Optional[Dict[str, str]] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
tools: Optional[List[Dict[str, Any]]] = None,
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
) -> OpenAIChatCompletion:
model_obj = await self.model_store.get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)
return await self._get_openai_client().chat.completions.create(**params) # type: ignore

View file

@ -5,7 +5,7 @@
# the root directory of this source tree.
import json
import logging
from typing import Any, AsyncGenerator, List, Optional, Union
from typing import Any, AsyncGenerator, Dict, List, Optional, Union
import httpx
from openai import AsyncOpenAI
@ -45,6 +45,7 @@ from llama_stack.apis.inference import (
ToolDefinition,
ToolPromptFormat,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.apis.models import Model, ModelType
from llama_stack.models.llama.datatypes import BuiltinTool, StopReason, ToolCall
from llama_stack.models.llama.sku_list import all_registered_models
@ -58,6 +59,7 @@ from llama_stack.providers.utils.inference.openai_compat import (
convert_message_to_openai_dict,
convert_tool_call,
get_sampling_options,
prepare_openai_completion_params,
process_chat_completion_stream_response,
process_completion_response,
process_completion_stream_response,
@ -418,3 +420,109 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
embeddings = [data.embedding for data in response.data]
return EmbeddingsResponse(embeddings=embeddings)
async def openai_completion(
self,
model: str,
prompt: Union[str, List[str], List[int], List[List[int]]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
guided_choice: Optional[List[str]] = None,
prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
model_obj = await self._get_model(model)
extra_body: Dict[str, Any] = {}
if prompt_logprobs is not None and prompt_logprobs >= 0:
extra_body["prompt_logprobs"] = prompt_logprobs
if guided_choice:
extra_body["guided_choice"] = guided_choice
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
prompt=prompt,
best_of=best_of,
echo=echo,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
top_p=top_p,
user=user,
extra_body=extra_body,
)
return await self.client.completions.create(**params) # type: ignore
async def openai_chat_completion(
self,
model: str,
messages: List[OpenAIMessageParam],
frequency_penalty: Optional[float] = None,
function_call: Optional[Union[str, Dict[str, Any]]] = None,
functions: Optional[List[Dict[str, Any]]] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_completion_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
response_format: Optional[Dict[str, str]] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
tools: Optional[List[Dict[str, Any]]] = None,
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
) -> OpenAIChatCompletion:
model_obj = await self._get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)
return await self.client.chat.completions.create(**params) # type: ignore

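The vLLM adapter is the only one that forwards the vLLM-specific `guided_choice` and `prompt_logprobs` knobs, tucking them into `extra_body` before calling the server. From an OpenAI client they travel the same way, which is what the integration tests later in this diff exercise. A minimal sketch, with the base URL, API key, and model id as placeholders:

```python
from openai import OpenAI

# Point the client at a running Llama Stack server's OpenAI-compatible base URL.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
resp = client.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",  # placeholder model id
    prompt="I am feeling really sad today.",
    extra_body={"guided_choice": ["joy", "sadness"]},  # forwarded to vLLM
)
print(resp.choices[0].text)  # constrained to one of the guided choices
```
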
View file

@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import AsyncGenerator, AsyncIterator, List, Optional, Union
from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
import litellm
@ -30,6 +30,7 @@ from llama_stack.apis.inference import (
ToolDefinition,
ToolPromptFormat,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam
from llama_stack.apis.models.models import Model
from llama_stack.distribution.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
@ -40,6 +41,7 @@ from llama_stack.providers.utils.inference.openai_compat import (
convert_openai_chat_completion_stream,
convert_tooldef_to_openai_tool,
get_sampling_options,
prepare_openai_completion_params,
)
from llama_stack.providers.utils.inference.prompt_adapter import (
interleaved_content_as_str,
@ -245,3 +247,103 @@ class LiteLLMOpenAIMixin(
embeddings = [data["embedding"] for data in response["data"]]
return EmbeddingsResponse(embeddings=embeddings)
async def openai_completion(
self,
model: str,
prompt: Union[str, List[str], List[int], List[List[int]]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
guided_choice: Optional[List[str]] = None,
prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
model_obj = await self._get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
prompt=prompt,
best_of=best_of,
echo=echo,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
top_p=top_p,
user=user,
guided_choice=guided_choice,
prompt_logprobs=prompt_logprobs,
)
return litellm.text_completion(**params)
async def openai_chat_completion(
self,
model: str,
messages: List[OpenAIMessageParam],
frequency_penalty: Optional[float] = None,
function_call: Optional[Union[str, Dict[str, Any]]] = None,
functions: Optional[List[Dict[str, Any]]] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_completion_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
response_format: Optional[Dict[str, str]] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
tools: Optional[List[Dict[str, Any]]] = None,
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
) -> OpenAIChatCompletion:
model_obj = await self._get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)
return litellm.completion(**params)

View file

@ -5,8 +5,10 @@
# the root directory of this source tree.
import json
import logging
import time
import uuid
import warnings
from typing import AsyncGenerator, Dict, Iterable, List, Optional, Union
from typing import Any, AsyncGenerator, Dict, Iterable, List, Optional, Union
from openai import AsyncStream
from openai.types.chat import (
@ -83,6 +85,7 @@ from llama_stack.apis.inference import (
TopPSamplingStrategy,
UserMessage,
)
from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAICompletionChoice
from llama_stack.models.llama.datatypes import (
BuiltinTool,
StopReason,
@ -843,6 +846,31 @@ def _convert_openai_logprobs(
]
def _convert_openai_sampling_params(
max_tokens: Optional[int] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
) -> SamplingParams:
sampling_params = SamplingParams()
if max_tokens:
sampling_params.max_tokens = max_tokens
# Map an explicit temperature of 0 to greedy sampling
if temperature == 0:
strategy = GreedySamplingStrategy()
else:
# OpenAI defaults to 1.0 for temperature and top_p if unset
if temperature is None:
temperature = 1.0
if top_p is None:
top_p = 1.0
strategy = TopPSamplingStrategy(temperature=temperature, top_p=top_p)
sampling_params.strategy = strategy
return sampling_params
def convert_openai_chat_completion_choice(
choice: OpenAIChoice,
) -> ChatCompletionResponse:
@ -1049,3 +1077,106 @@ async def convert_openai_chat_completion_stream(
stop_reason=stop_reason,
)
)
async def prepare_openai_completion_params(**params):
completion_params = {k: v for k, v in params.items() if v is not None}
return completion_params
class OpenAICompletionUnsupportedMixin:
async def openai_completion(
self,
model: str,
prompt: Union[str, List[str], List[int], List[List[int]]],
best_of: Optional[int] = None,
echo: Optional[bool] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
guided_choice: Optional[List[str]] = None,
prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
if stream:
raise ValueError(f"{self.__class__.__name__} doesn't support streaming openai completions")
# This is a pretty hacky way to emulate completions -
# basically it just de-batches them...
prompts = [prompt] if not isinstance(prompt, list) else prompt
sampling_params = _convert_openai_sampling_params(
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
)
choices = []
# "n" is the number of completions to generate per prompt
for _i in range(0, n or 1):
# and we may have multiple prompts, if batching was used
for prompt in prompts:
result = await self.completion(
model_id=model,
content=prompt,
sampling_params=sampling_params,
)
index = len(choices)
text = result.content
finish_reason = _convert_openai_finish_reason(result.stop_reason)
choice = OpenAICompletionChoice(
index=index,
text=text,
finish_reason=finish_reason,
)
choices.append(choice)
return OpenAICompletion(
id=f"cmpl-{uuid.uuid4()}",
choices=choices,
created=int(time.time()),
model=model,
object="text_completion",
)
class OpenAIChatCompletionUnsupportedMixin:
async def openai_chat_completion(
self,
model: str,
messages: List[OpenAIChatCompletionMessage],
frequency_penalty: Optional[float] = None,
function_call: Optional[Union[str, Dict[str, Any]]] = None,
functions: Optional[List[Dict[str, Any]]] = None,
logit_bias: Optional[Dict[str, float]] = None,
logprobs: Optional[bool] = None,
max_completion_tokens: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
parallel_tool_calls: Optional[bool] = None,
presence_penalty: Optional[float] = None,
response_format: Optional[Dict[str, str]] = None,
seed: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
stream: Optional[bool] = None,
stream_options: Optional[Dict[str, Any]] = None,
temperature: Optional[float] = None,
tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
tools: Optional[List[Dict[str, Any]]] = None,
top_logprobs: Optional[int] = None,
top_p: Optional[float] = None,
user: Optional[str] = None,
) -> OpenAIChatCompletion:
raise ValueError(f"{self.__class__.__name__} doesn't support openai chat completion")

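For providers with no native OpenAI path, `OpenAICompletionUnsupportedMixin` emulates `/completions` on top of the existing `completion` API, and `_convert_openai_sampling_params` maps the OpenAI sampling knobs onto Llama Stack's strategies: an explicit `temperature=0` becomes greedy sampling, and unset `temperature`/`top_p` fall back to OpenAI's 1.0 defaults. A small sketch of that mapping (the import paths are assumptions based on where these helpers live; `_convert_openai_sampling_params` is a private helper):

```python
# Illustrative check of the sampling-parameter mapping shown above.
from llama_stack.apis.inference import GreedySamplingStrategy, TopPSamplingStrategy
from llama_stack.providers.utils.inference.openai_compat import _convert_openai_sampling_params

params = _convert_openai_sampling_params(max_tokens=64, temperature=0)
assert isinstance(params.strategy, GreedySamplingStrategy)  # temperature=0 -> greedy
assert params.max_tokens == 64

params = _convert_openai_sampling_params()  # unset -> OpenAI defaults (1.0, 1.0)
assert isinstance(params.strategy, TopPSamplingStrategy)
assert params.strategy.temperature == 1.0 and params.strategy.top_p == 1.0
```
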
View file

@ -28,6 +28,7 @@ dependencies = [
"jinja2>=3.1.6",
"jsonschema",
"llama-stack-client>=0.2.1",
"openai>=1.66",
"prompt-toolkit",
"python-dotenv",
"pydantic>=2",

View file

@ -19,6 +19,7 @@ httpx==0.28.1
huggingface-hub==0.29.0
idna==3.10
jinja2==3.1.6
jiter==0.8.2
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
llama-stack-client==0.2.1
@ -27,6 +28,7 @@ markdown-it-py==3.0.0
markupsafe==3.0.2
mdurl==0.1.2
numpy==2.2.3
openai==1.71.0
packaging==24.2
pandas==2.2.3
pillow==11.1.0

View file

@ -0,0 +1,216 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import pytest
from openai import OpenAI
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
from ..test_cases.test_case import TestCase
def provider_from_model(client_with_models, model_id):
models = {m.identifier: m for m in client_with_models.models.list()}
models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
provider_id = models[model_id].provider_id
providers = {p.provider_id: p for p in client_with_models.providers.list()}
return providers[provider_id]
def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id):
if isinstance(client_with_models, LlamaStackAsLibraryClient):
pytest.skip("OpenAI completions are not supported when testing with library client yet.")
provider = provider_from_model(client_with_models, model_id)
if provider.provider_type in (
"inline::meta-reference",
"inline::sentence-transformers",
"inline::vllm",
"remote::bedrock",
"remote::cerebras",
"remote::databricks",
# Technically NVIDIA does support OpenAI completions, but none of their hosted models
# support both the completions and chat completions endpoints, and all the Llama models
# only support chat completions
"remote::nvidia",
"remote::runpod",
"remote::sambanova",
"remote::tgi",
):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
if isinstance(client_with_models, LlamaStackAsLibraryClient):
pytest.skip("OpenAI chat completions are not supported when testing with library client yet.")
provider = provider_from_model(client_with_models, model_id)
if provider.provider_type in (
"inline::meta-reference",
"inline::sentence-transformers",
"inline::vllm",
"remote::bedrock",
"remote::cerebras",
"remote::databricks",
"remote::runpod",
"remote::sambanova",
"remote::tgi",
):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI chat completions.")
def skip_if_provider_isnt_vllm(client_with_models, model_id):
provider = provider_from_model(client_with_models, model_id)
if provider.provider_type != "remote::vllm":
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support vllm extra_body parameters.")
@pytest.fixture
def openai_client(client_with_models):
base_url = f"{client_with_models.base_url}/v1/openai/v1"
return OpenAI(base_url=base_url, api_key="bar")
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:sanity",
],
)
def test_openai_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
tc = TestCase(test_case)
# ollama needs more verbose prompting for some reason here...
prompt = "Respond to this question and explain your answer. " + tc["content"]
response = openai_client.completions.create(
model=text_model_id,
prompt=prompt,
stream=False,
)
assert len(response.choices) > 0
choice = response.choices[0]
assert len(choice.text) > 10
@pytest.mark.parametrize(
"test_case",
[
"inference:completion:sanity",
],
)
def test_openai_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
tc = TestCase(test_case)
# ollama needs more verbose prompting for some reason here...
prompt = "Respond to this question and explain your answer. " + tc["content"]
response = openai_client.completions.create(
model=text_model_id,
prompt=prompt,
stream=True,
max_tokens=50,
)
streamed_content = [chunk.choices[0].text for chunk in response]
content_str = "".join(streamed_content).lower().strip()
assert len(content_str) > 10
@pytest.mark.parametrize(
"prompt_logprobs",
[
1,
0,
],
)
def test_openai_completion_prompt_logprobs(openai_client, client_with_models, text_model_id, prompt_logprobs):
skip_if_provider_isnt_vllm(client_with_models, text_model_id)
prompt = "Hello, world!"
response = openai_client.completions.create(
model=text_model_id,
prompt=prompt,
stream=False,
extra_body={
"prompt_logprobs": prompt_logprobs,
},
)
assert len(response.choices) > 0
choice = response.choices[0]
assert len(choice.prompt_logprobs) > 0
def test_openai_completion_guided_choice(openai_client, client_with_models, text_model_id):
skip_if_provider_isnt_vllm(client_with_models, text_model_id)
prompt = "I am feeling really sad today."
response = openai_client.completions.create(
model=text_model_id,
prompt=prompt,
stream=False,
extra_body={
"guided_choice": ["joy", "sadness"],
},
)
assert len(response.choices) > 0
choice = response.choices[0]
assert choice.text in ["joy", "sadness"]
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:non_streaming_01",
"inference:chat_completion:non_streaming_02",
],
)
def test_openai_chat_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
tc = TestCase(test_case)
question = tc["question"]
expected = tc["expected"]
response = openai_client.chat.completions.create(
model=text_model_id,
messages=[
{
"role": "user",
"content": question,
}
],
stream=False,
)
message_content = response.choices[0].message.content.lower().strip()
assert len(message_content) > 0
assert expected.lower() in message_content
@pytest.mark.parametrize(
"test_case",
[
"inference:chat_completion:streaming_01",
"inference:chat_completion:streaming_02",
],
)
def test_openai_chat_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
tc = TestCase(test_case)
question = tc["question"]
expected = tc["expected"]
response = openai_client.chat.completions.create(
model=text_model_id,
messages=[{"role": "user", "content": question}],
stream=True,
timeout=120, # Increase timeout to 2 minutes for large conversation history
)
streamed_content = []
for chunk in response:
if chunk.choices[0].delta.content:
streamed_content.append(chunk.choices[0].delta.content.lower().strip())
assert len(streamed_content) > 0
assert expected.lower() in "".join(streamed_content)

uv.lock generated
View file

@ -1384,6 +1384,7 @@ dependencies = [
{ name = "jinja2" },
{ name = "jsonschema" },
{ name = "llama-stack-client" },
{ name = "openai" },
{ name = "pillow" },
{ name = "prompt-toolkit" },
{ name = "pydantic" },
@ -1485,6 +1486,7 @@ requires-dist = [
{ name = "mcp", marker = "extra == 'test'" },
{ name = "myst-parser", marker = "extra == 'docs'" },
{ name = "nbval", marker = "extra == 'dev'" },
{ name = "openai", specifier = ">=1.66" },
{ name = "openai", marker = "extra == 'test'" },
{ name = "openai", marker = "extra == 'unit'" },
{ name = "opentelemetry-exporter-otlp-proto-http", marker = "extra == 'test'" },
@ -2016,7 +2018,7 @@ wheels = [
[[package]]
name = "openai"
version = "1.63.2"
version = "1.71.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "anyio" },
@ -2028,9 +2030,9 @@ dependencies = [
{ name = "tqdm" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e6/1c/11b520deb71f9ea54ced3c52cd6a5f7131215deba63ad07f23982e328141/openai-1.63.2.tar.gz", hash = "sha256:aeabeec984a7d2957b4928ceaa339e2ead19c61cfcf35ae62b7c363368d26360", size = 356902 }
sdist = { url = "https://files.pythonhosted.org/packages/d9/19/b8f0347090a649dce55a008ec54ac6abb50553a06508cdb5e7abb2813e99/openai-1.71.0.tar.gz", hash = "sha256:52b20bb990a1780f9b0b8ccebac93416343ebd3e4e714e3eff730336833ca207", size = 409926 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/15/64/db3462b358072387b8e93e6e6a38d3c741a17b4a84171ef01d6c85c63f25/openai-1.63.2-py3-none-any.whl", hash = "sha256:1f38b27b5a40814c2b7d8759ec78110df58c4a614c25f182809ca52b080ff4d4", size = 472282 },
{ url = "https://files.pythonhosted.org/packages/c4/f7/049e85faf6a000890e5ca0edca8e9183f8a43c9e7bba869cad871da0caba/openai-1.71.0-py3-none-any.whl", hash = "sha256:e1c643738f1fff1af52bce6ef06a7716c95d089281e7011777179614f32937aa", size = 598975 },
]
[[package]]