diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 567110829..36bfad49e 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -3092,6 +3092,125 @@ } } }, + "/v1/openai/v1/chat/completions": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenAIChatCompletion" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Inference" + ], + "description": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenaiChatCompletionRequest" + } + } + }, + "required": true + } + } + }, + "/v1/openai/v1/completions": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenAICompletion" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Inference" + ], + "description": "Generate an OpenAI-compatible completion for the given prompt using the specified model.", + "parameters": [], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenaiCompletionRequest" + } + } + }, + "required": true + } + } + }, + "/v1/openai/v1/models": { + "get": { + "responses": { + "200": { + "description": 
"OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/OpenAIListModelsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Models" + ], + "description": "", + "parameters": [] + } + }, "/v1/post-training/preference-optimize": { "post": { "responses": { @@ -8713,6 +8832,819 @@ ], "title": "LogEventRequest" }, + "OpenAIAssistantMessageParam": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "assistant", + "default": "assistant", + "description": "Must be \"assistant\" to identify this as the model's response" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the model's response" + }, + "name": { + "type": "string", + "description": "(Optional) The name of the assistant message participant." + }, + "tool_calls": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolCall" + }, + "description": "List of tool calls. Each tool call is a ToolCall object." + } + }, + "additionalProperties": false, + "required": [ + "role", + "content" + ], + "title": "OpenAIAssistantMessageParam", + "description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request." 
+ }, + "OpenAIDeveloperMessageParam": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "developer", + "default": "developer", + "description": "Must be \"developer\" to identify this as a developer message" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the developer message" + }, + "name": { + "type": "string", + "description": "(Optional) The name of the developer message participant." + } + }, + "additionalProperties": false, + "required": [ + "role", + "content" + ], + "title": "OpenAIDeveloperMessageParam", + "description": "A message from the developer in an OpenAI-compatible chat completion request." + }, + "OpenAIMessageParam": { + "oneOf": [ + { + "$ref": "#/components/schemas/OpenAIUserMessageParam" + }, + { + "$ref": "#/components/schemas/OpenAISystemMessageParam" + }, + { + "$ref": "#/components/schemas/OpenAIAssistantMessageParam" + }, + { + "$ref": "#/components/schemas/OpenAIToolMessageParam" + }, + { + "$ref": "#/components/schemas/OpenAIDeveloperMessageParam" + } + ], + "discriminator": { + "propertyName": "role", + "mapping": { + "user": "#/components/schemas/OpenAIUserMessageParam", + "system": "#/components/schemas/OpenAISystemMessageParam", + "assistant": "#/components/schemas/OpenAIAssistantMessageParam", + "tool": "#/components/schemas/OpenAIToolMessageParam", + "developer": "#/components/schemas/OpenAIDeveloperMessageParam" + } + } + }, + "OpenAISystemMessageParam": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "system", + "default": "system", + "description": "Must be \"system\" to identify this as a system message" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. 
The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)." + }, + "name": { + "type": "string", + "description": "(Optional) The name of the system message participant." + } + }, + "additionalProperties": false, + "required": [ + "role", + "content" + ], + "title": "OpenAISystemMessageParam", + "description": "A system message providing instructions or context to the model." + }, + "OpenAIToolMessageParam": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "tool", + "default": "tool", + "description": "Must be \"tool\" to identify this as a tool response" + }, + "tool_call_id": { + "type": "string", + "description": "Unique identifier for the tool call this response is for" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The response content from the tool" + } + }, + "additionalProperties": false, + "required": [ + "role", + "tool_call_id", + "content" + ], + "title": "OpenAIToolMessageParam", + "description": "A message representing the result of a tool invocation in an OpenAI-compatible chat completion request." + }, + "OpenAIUserMessageParam": { + "type": "object", + "properties": { + "role": { + "type": "string", + "const": "user", + "default": "user", + "description": "Must be \"user\" to identify this as a user message" + }, + "content": { + "$ref": "#/components/schemas/InterleavedContent", + "description": "The content of the message, which can include text and other media" + }, + "name": { + "type": "string", + "description": "(Optional) The name of the user message participant." + } + }, + "additionalProperties": false, + "required": [ + "role", + "content" + ], + "title": "OpenAIUserMessageParam", + "description": "A message from the user in an OpenAI-compatible chat completion request." 
+ }, + "OpenaiChatCompletionRequest": { + "type": "object", + "properties": { + "model": { + "type": "string", + "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." + }, + "messages": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIMessageParam" + }, + "description": "List of messages in the conversation" + }, + "frequency_penalty": { + "type": "number", + "description": "(Optional) The penalty for repeated tokens" + }, + "function_call": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + ], + "description": "(Optional) The function call to use" + }, + "functions": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "description": "(Optional) List of functions to use" + }, + "logit_bias": { + "type": "object", + "additionalProperties": { + "type": "number" + }, + "description": "(Optional) The logit bias to use" + }, + "logprobs": { + "type": "boolean", + "description": "(Optional) The log probabilities to use" + }, + "max_completion_tokens": { + "type": "integer", + "description": "(Optional) The maximum number of tokens to generate" + }, + "max_tokens": { + "type": "integer", + "description": "(Optional) The maximum number of tokens to generate" + }, + "n": { + "type": "integer", + "description": "(Optional) The number of completions to generate" + }, + "parallel_tool_calls": { + "type": "boolean", + "description": "(Optional) Whether to parallelize tool calls" + }, + "presence_penalty": { + "type": 
"number", + "description": "(Optional) The penalty for repeated tokens" + }, + "response_format": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "(Optional) The response format to use" + }, + "seed": { + "type": "integer", + "description": "(Optional) The seed to use" + }, + "stop": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ], + "description": "(Optional) The stop tokens to use" + }, + "stream": { + "type": "boolean", + "description": "(Optional) Whether to stream the response" + }, + "stream_options": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + }, + "description": "(Optional) The stream options to use" + }, + "temperature": { + "type": "number", + "description": "(Optional) The temperature to use" + }, + "tool_choice": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + ], + "description": "(Optional) The tool choice to use" + }, + "tools": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "description": "(Optional) The tools to use" + }, + "top_logprobs": { + "type": "integer", + "description": "(Optional) The top log probabilities to use" + }, + "top_p": { + "type": "number", + "description": "(Optional) The top p to use" + }, + "user": { + "type": "string", + "description": "(Optional) The user to use" + } + }, + 
"additionalProperties": false, + "required": [ + "model", + "messages" + ], + "title": "OpenaiChatCompletionRequest" + }, + "OpenAIChatCompletion": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The ID of the chat completion" + }, + "choices": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIChoice" + }, + "description": "List of choices" + }, + "object": { + "type": "string", + "const": "chat.completion", + "default": "chat.completion", + "description": "The object type, which will be \"chat.completion\"" + }, + "created": { + "type": "integer", + "description": "The Unix timestamp in seconds when the chat completion was created" + }, + "model": { + "type": "string", + "description": "The model that was used to generate the chat completion" + } + }, + "additionalProperties": false, + "required": [ + "id", + "choices", + "object", + "created", + "model" + ], + "title": "OpenAIChatCompletion", + "description": "Response from an OpenAI-compatible chat completion request." + }, + "OpenAIChoice": { + "type": "object", + "properties": { + "message": { + "$ref": "#/components/schemas/OpenAIMessageParam", + "description": "The message from the model" + }, + "finish_reason": { + "type": "string", + "description": "The reason the model stopped generating" + }, + "index": { + "type": "integer" + }, + "logprobs": { + "$ref": "#/components/schemas/OpenAIChoiceLogprobs" + } + }, + "additionalProperties": false, + "required": [ + "message", + "finish_reason", + "index" + ], + "title": "OpenAIChoice", + "description": "A choice from an OpenAI-compatible chat completion response." 
+ }, + "OpenAIChoiceLogprobs": { + "type": "object", + "properties": { + "content": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAITokenLogProb" + } + }, + "refusal": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAITokenLogProb" + } + } + }, + "additionalProperties": false, + "title": "OpenAIChoiceLogprobs", + "description": "The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response." + }, + "OpenAITokenLogProb": { + "type": "object", + "properties": { + "token": { + "type": "string" + }, + "bytes": { + "type": "array", + "items": { + "type": "integer" + } + }, + "logprob": { + "type": "number" + }, + "top_logprobs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAITopLogProb" + } + } + }, + "additionalProperties": false, + "required": [ + "token", + "logprob", + "top_logprobs" + ], + "title": "OpenAITokenLogProb", + "description": "The log probability for a token from an OpenAI-compatible chat completion response." + }, + "OpenAITopLogProb": { + "type": "object", + "properties": { + "token": { + "type": "string" + }, + "bytes": { + "type": "array", + "items": { + "type": "integer" + } + }, + "logprob": { + "type": "number" + } + }, + "additionalProperties": false, + "required": [ + "token", + "logprob" + ], + "title": "OpenAITopLogProb", + "description": "The top log probability for a token from an OpenAI-compatible chat completion response." + }, + "OpenaiCompletionRequest": { + "type": "object", + "properties": { + "model": { + "type": "string", + "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint." 
+ }, + "prompt": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + }, + { + "type": "array", + "items": { + "type": "integer" + } + }, + { + "type": "array", + "items": { + "type": "array", + "items": { + "type": "integer" + } + } + } + ], + "description": "The prompt to generate a completion for" + }, + "best_of": { + "type": "integer", + "description": "(Optional) The number of completions to generate" + }, + "echo": { + "type": "boolean", + "description": "(Optional) Whether to echo the prompt" + }, + "frequency_penalty": { + "type": "number", + "description": "(Optional) The penalty for repeated tokens" + }, + "logit_bias": { + "type": "object", + "additionalProperties": { + "type": "number" + }, + "description": "(Optional) The logit bias to use" + }, + "logprobs": { + "type": "boolean", + "description": "(Optional) The log probabilities to use" + }, + "max_tokens": { + "type": "integer", + "description": "(Optional) The maximum number of tokens to generate" + }, + "n": { + "type": "integer", + "description": "(Optional) The number of completions to generate" + }, + "presence_penalty": { + "type": "number", + "description": "(Optional) The penalty for repeated tokens" + }, + "seed": { + "type": "integer", + "description": "(Optional) The seed to use" + }, + "stop": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ], + "description": "(Optional) The stop tokens to use" + }, + "stream": { + "type": "boolean", + "description": "(Optional) Whether to stream the response" + }, + "stream_options": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + }, + "description": "(Optional) The stream options to use" + }, + "temperature": { + "type": "number", + "description": 
"(Optional) The temperature to use" + }, + "top_p": { + "type": "number", + "description": "(Optional) The top p to use" + }, + "user": { + "type": "string", + "description": "(Optional) The user to use" + }, + "guided_choice": { + "type": "array", + "items": { + "type": "string" + } + }, + "prompt_logprobs": { + "type": "integer" + } + }, + "additionalProperties": false, + "required": [ + "model", + "prompt" + ], + "title": "OpenaiCompletionRequest" + }, + "OpenAICompletion": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAICompletionChoice" + } + }, + "created": { + "type": "integer" + }, + "model": { + "type": "string" + }, + "object": { + "type": "string", + "const": "text_completion", + "default": "text_completion" + } + }, + "additionalProperties": false, + "required": [ + "id", + "choices", + "created", + "model", + "object" + ], + "title": "OpenAICompletion", + "description": "Response from an OpenAI-compatible completion request." + }, + "OpenAICompletionChoice": { + "type": "object", + "properties": { + "finish_reason": { + "type": "string" + }, + "text": { + "type": "string" + }, + "index": { + "type": "integer" + }, + "logprobs": { + "$ref": "#/components/schemas/OpenAIChoiceLogprobs" + } + }, + "additionalProperties": false, + "required": [ + "finish_reason", + "text", + "index" + ], + "title": "OpenAICompletionChoice", + "description": "A choice from an OpenAI-compatible completion response." + }, + "OpenAIModel": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "object": { + "type": "string", + "const": "model", + "default": "model" + }, + "created": { + "type": "integer" + }, + "owned_by": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "id", + "object", + "created", + "owned_by" + ], + "title": "OpenAIModel", + "description": "A model from OpenAI." 
+ }, + "OpenAIListModelsResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/OpenAIModel" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ], + "title": "OpenAIListModelsResponse" + }, "DPOAlignmentConfig": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 1dfd17f55..82faf450a 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -2131,6 +2131,91 @@ paths: schema: $ref: '#/components/schemas/LogEventRequest' required: true + /v1/openai/v1/chat/completions: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/OpenAIChatCompletion' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Inference + description: >- + Generate an OpenAI-compatible chat completion for the given messages using + the specified model. + parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/OpenaiChatCompletionRequest' + required: true + /v1/openai/v1/completions: + post: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/OpenAICompletion' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Inference + description: >- + Generate an OpenAI-compatible completion for the given prompt using the specified + model. 
+ parameters: [] + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/OpenaiCompletionRequest' + required: true + /v1/openai/v1/models: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/OpenAIListModelsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Models + description: '' + parameters: [] /v1/post-training/preference-optimize: post: responses: @@ -5980,6 +6065,586 @@ components: - event - ttl_seconds title: LogEventRequest + OpenAIAssistantMessageParam: + type: object + properties: + role: + type: string + const: assistant + default: assistant + description: >- + Must be "assistant" to identify this as the model's response + content: + $ref: '#/components/schemas/InterleavedContent' + description: The content of the model's response + name: + type: string + description: >- + (Optional) The name of the assistant message participant. + tool_calls: + type: array + items: + $ref: '#/components/schemas/ToolCall' + description: >- + List of tool calls. Each tool call is a ToolCall object. + additionalProperties: false + required: + - role + - content + title: OpenAIAssistantMessageParam + description: >- + A message containing the model's (assistant) response in an OpenAI-compatible + chat completion request. + OpenAIDeveloperMessageParam: + type: object + properties: + role: + type: string + const: developer + default: developer + description: >- + Must be "developer" to identify this as a developer message + content: + $ref: '#/components/schemas/InterleavedContent' + description: The content of the developer message + name: + type: string + description: >- + (Optional) The name of the developer message participant. 
+ additionalProperties: false + required: + - role + - content + title: OpenAIDeveloperMessageParam + description: >- + A message from the developer in an OpenAI-compatible chat completion request. + OpenAIMessageParam: + oneOf: + - $ref: '#/components/schemas/OpenAIUserMessageParam' + - $ref: '#/components/schemas/OpenAISystemMessageParam' + - $ref: '#/components/schemas/OpenAIAssistantMessageParam' + - $ref: '#/components/schemas/OpenAIToolMessageParam' + - $ref: '#/components/schemas/OpenAIDeveloperMessageParam' + discriminator: + propertyName: role + mapping: + user: '#/components/schemas/OpenAIUserMessageParam' + system: '#/components/schemas/OpenAISystemMessageParam' + assistant: '#/components/schemas/OpenAIAssistantMessageParam' + tool: '#/components/schemas/OpenAIToolMessageParam' + developer: '#/components/schemas/OpenAIDeveloperMessageParam' + OpenAISystemMessageParam: + type: object + properties: + role: + type: string + const: system + default: system + description: >- + Must be "system" to identify this as a system message + content: + $ref: '#/components/schemas/InterleavedContent' + description: >- + The content of the "system prompt". If multiple system messages are provided, + they are concatenated. The underlying Llama Stack code may also add other + system messages (for example, for formatting tool definitions). + name: + type: string + description: >- + (Optional) The name of the system message participant. + additionalProperties: false + required: + - role + - content + title: OpenAISystemMessageParam + description: >- + A system message providing instructions or context to the model. 
+ OpenAIToolMessageParam: + type: object + properties: + role: + type: string + const: tool + default: tool + description: >- + Must be "tool" to identify this as a tool response + tool_call_id: + type: string + description: >- + Unique identifier for the tool call this response is for + content: + $ref: '#/components/schemas/InterleavedContent' + description: The response content from the tool + additionalProperties: false + required: + - role + - tool_call_id + - content + title: OpenAIToolMessageParam + description: >- + A message representing the result of a tool invocation in an OpenAI-compatible + chat completion request. + OpenAIUserMessageParam: + type: object + properties: + role: + type: string + const: user + default: user + description: >- + Must be "user" to identify this as a user message + content: + $ref: '#/components/schemas/InterleavedContent' + description: >- + The content of the message, which can include text and other media + name: + type: string + description: >- + (Optional) The name of the user message participant. + additionalProperties: false + required: + - role + - content + title: OpenAIUserMessageParam + description: >- + A message from the user in an OpenAI-compatible chat completion request. + OpenaiChatCompletionRequest: + type: object + properties: + model: + type: string + description: >- + The identifier of the model to use. The model must be registered with + Llama Stack and available via the /models endpoint. 
+ messages: + type: array + items: + $ref: '#/components/schemas/OpenAIMessageParam' + description: List of messages in the conversation + frequency_penalty: + type: number + description: >- + (Optional) The penalty for repeated tokens + function_call: + oneOf: + - type: string + - type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: (Optional) The function call to use + functions: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: (Optional) List of functions to use + logit_bias: + type: object + additionalProperties: + type: number + description: (Optional) The logit bias to use + logprobs: + type: boolean + description: (Optional) The log probabilities to use + max_completion_tokens: + type: integer + description: >- + (Optional) The maximum number of tokens to generate + max_tokens: + type: integer + description: >- + (Optional) The maximum number of tokens to generate + n: + type: integer + description: >- + (Optional) The number of completions to generate + parallel_tool_calls: + type: boolean + description: >- + (Optional) Whether to parallelize tool calls + presence_penalty: + type: number + description: >- + (Optional) The penalty for repeated tokens + response_format: + type: object + additionalProperties: + type: string + description: (Optional) The response format to use + seed: + type: integer + description: (Optional) The seed to use + stop: + oneOf: + - type: string + - type: array + items: + type: string + description: (Optional) The stop tokens to use + stream: + type: boolean + description: >- + (Optional) Whether to stream the response + stream_options: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + 
description: (Optional) The stream options to use + temperature: + type: number + description: (Optional) The temperature to use + tool_choice: + oneOf: + - type: string + - type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: (Optional) The tool choice to use + tools: + type: array + items: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: (Optional) The tools to use + top_logprobs: + type: integer + description: >- + (Optional) The top log probabilities to use + top_p: + type: number + description: (Optional) The top p to use + user: + type: string + description: (Optional) The user to use + additionalProperties: false + required: + - model + - messages + title: OpenaiChatCompletionRequest + OpenAIChatCompletion: + type: object + properties: + id: + type: string + description: The ID of the chat completion + choices: + type: array + items: + $ref: '#/components/schemas/OpenAIChoice' + description: List of choices + object: + type: string + const: chat.completion + default: chat.completion + description: >- + The object type, which will be "chat.completion" + created: + type: integer + description: >- + The Unix timestamp in seconds when the chat completion was created + model: + type: string + description: >- + The model that was used to generate the chat completion + additionalProperties: false + required: + - id + - choices + - object + - created + - model + title: OpenAIChatCompletion + description: >- + Response from an OpenAI-compatible chat completion request. 
+ OpenAIChoice: + type: object + properties: + message: + $ref: '#/components/schemas/OpenAIMessageParam' + description: The message from the model + finish_reason: + type: string + description: The reason the model stopped generating + index: + type: integer + logprobs: + $ref: '#/components/schemas/OpenAIChoiceLogprobs' + additionalProperties: false + required: + - message + - finish_reason + - index + title: OpenAIChoice + description: >- + A choice from an OpenAI-compatible chat completion response. + OpenAIChoiceLogprobs: + type: object + properties: + content: + type: array + items: + $ref: '#/components/schemas/OpenAITokenLogProb' + refusal: + type: array + items: + $ref: '#/components/schemas/OpenAITokenLogProb' + additionalProperties: false + title: OpenAIChoiceLogprobs + description: >- + The log probabilities for the tokens in the message from an OpenAI-compatible + chat completion response. + OpenAITokenLogProb: + type: object + properties: + token: + type: string + bytes: + type: array + items: + type: integer + logprob: + type: number + top_logprobs: + type: array + items: + $ref: '#/components/schemas/OpenAITopLogProb' + additionalProperties: false + required: + - token + - logprob + - top_logprobs + title: OpenAITokenLogProb + description: >- + The log probability for a token from an OpenAI-compatible chat completion + response. + OpenAITopLogProb: + type: object + properties: + token: + type: string + bytes: + type: array + items: + type: integer + logprob: + type: number + additionalProperties: false + required: + - token + - logprob + title: OpenAITopLogProb + description: >- + The top log probability for a token from an OpenAI-compatible chat completion + response. + OpenaiCompletionRequest: + type: object + properties: + model: + type: string + description: >- + The identifier of the model to use. The model must be registered with + Llama Stack and available via the /models endpoint. 
+ prompt: + oneOf: + - type: string + - type: array + items: + type: string + - type: array + items: + type: integer + - type: array + items: + type: array + items: + type: integer + description: The prompt to generate a completion for + best_of: + type: integer + description: >- + (Optional) The number of completions to generate + echo: + type: boolean + description: (Optional) Whether to echo the prompt + frequency_penalty: + type: number + description: >- + (Optional) The penalty for repeated tokens + logit_bias: + type: object + additionalProperties: + type: number + description: (Optional) The logit bias to use + logprobs: + type: boolean + description: (Optional) The log probabilities to use + max_tokens: + type: integer + description: >- + (Optional) The maximum number of tokens to generate + n: + type: integer + description: >- + (Optional) The number of completions to generate + presence_penalty: + type: number + description: >- + (Optional) The penalty for repeated tokens + seed: + type: integer + description: (Optional) The seed to use + stop: + oneOf: + - type: string + - type: array + items: + type: string + description: (Optional) The stop tokens to use + stream: + type: boolean + description: >- + (Optional) Whether to stream the response + stream_options: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: (Optional) The stream options to use + temperature: + type: number + description: (Optional) The temperature to use + top_p: + type: number + description: (Optional) The top p to use + user: + type: string + description: (Optional) The user to use + guided_choice: + type: array + items: + type: string + prompt_logprobs: + type: integer + additionalProperties: false + required: + - model + - prompt + title: OpenaiCompletionRequest + OpenAICompletion: + type: object + properties: + id: + type: string + choices: + type: array + items: + 
# OpenAI-compatible request/response models.
#
# Field documentation uses the ``:param <name>:`` form throughout: in the generated
# OpenAPI spec only fields documented that way carry a description, fields documented
# with a bare ``:<name>:`` come out undocumented.


@json_schema_type
class OpenAIUserMessageParam(BaseModel):
    """A message from the user in an OpenAI-compatible chat completion request.

    :param role: Must be "user" to identify this as a user message
    :param content: The content of the message, which can include text and other media
    :param name: (Optional) The name of the user message participant.
    """

    role: Literal["user"] = "user"
    content: InterleavedContent
    name: Optional[str] = None


@json_schema_type
class OpenAISystemMessageParam(BaseModel):
    """A system message providing instructions or context to the model.

    :param role: Must be "system" to identify this as a system message
    :param content: The content of the "system prompt". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions).
    :param name: (Optional) The name of the system message participant.
    """

    role: Literal["system"] = "system"
    content: InterleavedContent
    name: Optional[str] = None


@json_schema_type
class OpenAIAssistantMessageParam(BaseModel):
    """A message containing the model's (assistant) response in an OpenAI-compatible chat completion request.

    :param role: Must be "assistant" to identify this as the model's response
    :param content: The content of the model's response
    :param name: (Optional) The name of the assistant message participant.
    :param tool_calls: List of tool calls. Each tool call is a ToolCall object.
    """

    role: Literal["assistant"] = "assistant"
    content: InterleavedContent
    name: Optional[str] = None
    # default_factory gives every instance its own empty list when no tool calls are supplied.
    tool_calls: Optional[List[ToolCall]] = Field(default_factory=list)


@json_schema_type
class OpenAIToolMessageParam(BaseModel):
    """A message representing the result of a tool invocation in an OpenAI-compatible chat completion request.

    :param role: Must be "tool" to identify this as a tool response
    :param tool_call_id: Unique identifier for the tool call this response is for
    :param content: The response content from the tool
    """

    role: Literal["tool"] = "tool"
    tool_call_id: str
    content: InterleavedContent


@json_schema_type
class OpenAIDeveloperMessageParam(BaseModel):
    """A message from the developer in an OpenAI-compatible chat completion request.

    :param role: Must be "developer" to identify this as a developer message
    :param content: The content of the developer message
    :param name: (Optional) The name of the developer message participant.
    """

    role: Literal["developer"] = "developer"
    content: InterleavedContent
    name: Optional[str] = None


# Tagged union of all message kinds; the literal ``role`` value selects the concrete model.
OpenAIMessageParam = Annotated[
    Union[
        OpenAIUserMessageParam,
        OpenAISystemMessageParam,
        OpenAIAssistantMessageParam,
        OpenAIToolMessageParam,
        OpenAIDeveloperMessageParam,
    ],
    Field(discriminator="role"),
]
register_schema(OpenAIMessageParam, name="OpenAIMessageParam")


@json_schema_type
class OpenAITopLogProb(BaseModel):
    """The top log probability for a token from an OpenAI-compatible chat completion response.

    :param token: The token
    :param bytes: (Optional) The bytes for the token
    :param logprob: The log probability of the token
    """

    token: str
    bytes: Optional[List[int]] = None
    logprob: float


@json_schema_type
class OpenAITokenLogProb(BaseModel):
    """The log probability for a token from an OpenAI-compatible chat completion response.

    :param token: The token
    :param bytes: (Optional) The bytes for the token
    :param logprob: The log probability of the token
    :param top_logprobs: The top log probabilities for the token
    """

    token: str
    bytes: Optional[List[int]] = None
    logprob: float
    top_logprobs: List[OpenAITopLogProb]


@json_schema_type
class OpenAIChoiceLogprobs(BaseModel):
    """The log probabilities for the tokens in the message from an OpenAI-compatible chat completion response.

    :param content: (Optional) The log probabilities for the tokens in the message
    :param refusal: (Optional) The log probabilities for the tokens in the refusal
    """

    content: Optional[List[OpenAITokenLogProb]] = None
    refusal: Optional[List[OpenAITokenLogProb]] = None


@json_schema_type
class OpenAIChoice(BaseModel):
    """A choice from an OpenAI-compatible chat completion response.

    :param message: The message from the model
    :param finish_reason: The reason the model stopped generating
    :param index: The index of the choice
    :param logprobs: (Optional) The log probabilities for the tokens in the message
    """

    message: OpenAIMessageParam
    finish_reason: str
    index: int
    logprobs: Optional[OpenAIChoiceLogprobs] = None


@json_schema_type
class OpenAIChatCompletion(BaseModel):
    """Response from an OpenAI-compatible chat completion request.

    :param id: The ID of the chat completion
    :param choices: List of choices
    :param object: The object type, which will be "chat.completion"
    :param created: The Unix timestamp in seconds when the chat completion was created
    :param model: The model that was used to generate the chat completion
    """

    id: str
    choices: List[OpenAIChoice]
    object: Literal["chat.completion"] = "chat.completion"
    created: int
    model: str


@json_schema_type
class OpenAICompletionLogprobs(BaseModel):
    """The log probabilities for the tokens in the message from an OpenAI-compatible completion response.

    :param text_offset: (Optional) The offset of the token in the text
    :param token_logprobs: (Optional) The log probabilities for the tokens
    :param tokens: (Optional) The tokens
    :param top_logprobs: (Optional) The top log probabilities for the tokens
    """

    text_offset: Optional[List[int]] = None
    token_logprobs: Optional[List[float]] = None
    tokens: Optional[List[str]] = None
    top_logprobs: Optional[List[Dict[str, float]]] = None


@json_schema_type
class OpenAICompletionChoice(BaseModel):
    """A choice from an OpenAI-compatible completion response.

    :param finish_reason: The reason the model stopped generating
    :param text: The text of the choice
    :param index: The index of the choice
    :param logprobs: (Optional) The log probabilities for the tokens in the choice
    """

    finish_reason: str
    text: str
    index: int
    # NOTE(review): this uses the chat-style OpenAIChoiceLogprobs, while the
    # completion-specific OpenAICompletionLogprobs defined just above is not
    # referenced by any model here — confirm which shape is intended before
    # changing the type, since it alters the published schema.
    logprobs: Optional[OpenAIChoiceLogprobs] = None


@json_schema_type
class OpenAICompletion(BaseModel):
    """Response from an OpenAI-compatible completion request.

    :param id: The ID of the completion
    :param choices: List of choices
    :param created: The Unix timestamp in seconds when the completion was created
    :param model: The model that was used to generate the completion
    :param object: The object type, which will be "text_completion"
    """

    id: str
    choices: List[OpenAICompletionChoice]
    created: int
    model: str
    object: Literal["text_completion"] = "text_completion"
+ + @webmethod(route="/openai/v1/completions", method="POST") + async def openai_completion( + self, + # Standard OpenAI completion parameters + model: str, + prompt: Union[str, List[str], List[int], List[List[int]]], + best_of: Optional[int] = None, + echo: Optional[bool] = None, + frequency_penalty: Optional[float] = None, + logit_bias: Optional[Dict[str, float]] = None, + logprobs: Optional[bool] = None, + max_tokens: Optional[int] = None, + n: Optional[int] = None, + presence_penalty: Optional[float] = None, + seed: Optional[int] = None, + stop: Optional[Union[str, List[str]]] = None, + stream: Optional[bool] = None, + stream_options: Optional[Dict[str, Any]] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + user: Optional[str] = None, + # vLLM-specific parameters + guided_choice: Optional[List[str]] = None, + prompt_logprobs: Optional[int] = None, + ) -> OpenAICompletion: + """Generate an OpenAI-compatible completion for the given prompt using the specified model. + + :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint. 
+ :param prompt: The prompt to generate a completion for + :param best_of: (Optional) The number of completions to generate + :param echo: (Optional) Whether to echo the prompt + :param frequency_penalty: (Optional) The penalty for repeated tokens + :param logit_bias: (Optional) The logit bias to use + :param logprobs: (Optional) The log probabilities to use + :param max_tokens: (Optional) The maximum number of tokens to generate + :param n: (Optional) The number of completions to generate + :param presence_penalty: (Optional) The penalty for repeated tokens + :param seed: (Optional) The seed to use + :param stop: (Optional) The stop tokens to use + :param stream: (Optional) Whether to stream the response + :param stream_options: (Optional) The stream options to use + :param temperature: (Optional) The temperature to use + :param top_p: (Optional) The top p to use + :param user: (Optional) The user to use + """ + ... + + @webmethod(route="/openai/v1/chat/completions", method="POST") + async def openai_chat_completion( + self, + model: str, + messages: List[OpenAIMessageParam], + frequency_penalty: Optional[float] = None, + function_call: Optional[Union[str, Dict[str, Any]]] = None, + functions: Optional[List[Dict[str, Any]]] = None, + logit_bias: Optional[Dict[str, float]] = None, + logprobs: Optional[bool] = None, + max_completion_tokens: Optional[int] = None, + max_tokens: Optional[int] = None, + n: Optional[int] = None, + parallel_tool_calls: Optional[bool] = None, + presence_penalty: Optional[float] = None, + response_format: Optional[Dict[str, str]] = None, + seed: Optional[int] = None, + stop: Optional[Union[str, List[str]]] = None, + stream: Optional[bool] = None, + stream_options: Optional[Dict[str, Any]] = None, + temperature: Optional[float] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + top_logprobs: Optional[int] = None, + top_p: Optional[float] = None, + user: Optional[str] = None, 
+ ) -> OpenAIChatCompletion: + """Generate an OpenAI-compatible chat completion for the given messages using the specified model. + + :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint. + :param messages: List of messages in the conversation + :param frequency_penalty: (Optional) The penalty for repeated tokens + :param function_call: (Optional) The function call to use + :param functions: (Optional) List of functions to use + :param logit_bias: (Optional) The logit bias to use + :param logprobs: (Optional) The log probabilities to use + :param max_completion_tokens: (Optional) The maximum number of tokens to generate + :param max_tokens: (Optional) The maximum number of tokens to generate + :param n: (Optional) The number of completions to generate + :param parallel_tool_calls: (Optional) Whether to parallelize tool calls + :param presence_penalty: (Optional) The penalty for repeated tokens + :param response_format: (Optional) The response format to use + :param seed: (Optional) The seed to use + :param stop: (Optional) The stop tokens to use + :param stream: (Optional) Whether to stream the response + :param stream_options: (Optional) The stream options to use + :param temperature: (Optional) The temperature to use + :param tool_choice: (Optional) The tool choice to use + :param tools: (Optional) The tools to use + :param top_logprobs: (Optional) The top log probabilities to use + :param top_p: (Optional) The top p to use + :param user: (Optional) The user to use + """ + ... diff --git a/llama_stack/apis/models/models.py b/llama_stack/apis/models/models.py index 893ebc179..97398ce75 100644 --- a/llama_stack/apis/models/models.py +++ b/llama_stack/apis/models/models.py @@ -56,12 +56,35 @@ class ListModelsResponse(BaseModel): data: List[Model] +@json_schema_type +class OpenAIModel(BaseModel): + """A model from OpenAI. 
+ + :id: The ID of the model + :object: The object type, which will be "model" + :created: The Unix timestamp in seconds when the model was created + :owned_by: The owner of the model + """ + + id: str + object: Literal["model"] = "model" + created: int + owned_by: str + + +class OpenAIListModelsResponse(BaseModel): + data: List[OpenAIModel] + + @runtime_checkable @trace_protocol class Models(Protocol): @webmethod(route="/models", method="GET") async def list_models(self) -> ListModelsResponse: ... + @webmethod(route="/openai/v1/models", method="GET") + async def openai_list_models(self) -> OpenAIListModelsResponse: ... + @webmethod(route="/models/{model_id:path}", method="GET") async def get_model( self, diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index eed96a40a..bc313036f 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -35,6 +35,7 @@ from llama_stack.apis.inference import ( ToolDefinition, ToolPromptFormat, ) +from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam from llama_stack.apis.models import Model, ModelType from llama_stack.apis.safety import RunShieldResponse, Safety from llama_stack.apis.scoring import ( @@ -419,6 +420,126 @@ class InferenceRouter(Inference): task_type=task_type, ) + async def openai_completion( + self, + model: str, + prompt: Union[str, List[str], List[int], List[List[int]]], + best_of: Optional[int] = None, + echo: Optional[bool] = None, + frequency_penalty: Optional[float] = None, + logit_bias: Optional[Dict[str, float]] = None, + logprobs: Optional[bool] = None, + max_tokens: Optional[int] = None, + n: Optional[int] = None, + presence_penalty: Optional[float] = None, + seed: Optional[int] = None, + stop: Optional[Union[str, List[str]]] = None, + stream: Optional[bool] = None, + stream_options: Optional[Dict[str, Any]] = None, + temperature: Optional[float] = 
None, + top_p: Optional[float] = None, + user: Optional[str] = None, + guided_choice: Optional[List[str]] = None, + prompt_logprobs: Optional[int] = None, + ) -> OpenAICompletion: + logger.debug( + f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}", + ) + model_obj = await self.routing_table.get_model(model) + if model_obj is None: + raise ValueError(f"Model '{model}' not found") + if model_obj.model_type == ModelType.embedding: + raise ValueError(f"Model '{model}' is an embedding model and does not support completions") + + params = dict( + model=model_obj.identifier, + prompt=prompt, + best_of=best_of, + echo=echo, + frequency_penalty=frequency_penalty, + logit_bias=logit_bias, + logprobs=logprobs, + max_tokens=max_tokens, + n=n, + presence_penalty=presence_penalty, + seed=seed, + stop=stop, + stream=stream, + stream_options=stream_options, + temperature=temperature, + top_p=top_p, + user=user, + guided_choice=guided_choice, + prompt_logprobs=prompt_logprobs, + ) + + provider = self.routing_table.get_provider_impl(model_obj.identifier) + return await provider.openai_completion(**params) + + async def openai_chat_completion( + self, + model: str, + messages: List[OpenAIMessageParam], + frequency_penalty: Optional[float] = None, + function_call: Optional[Union[str, Dict[str, Any]]] = None, + functions: Optional[List[Dict[str, Any]]] = None, + logit_bias: Optional[Dict[str, float]] = None, + logprobs: Optional[bool] = None, + max_completion_tokens: Optional[int] = None, + max_tokens: Optional[int] = None, + n: Optional[int] = None, + parallel_tool_calls: Optional[bool] = None, + presence_penalty: Optional[float] = None, + response_format: Optional[Dict[str, str]] = None, + seed: Optional[int] = None, + stop: Optional[Union[str, List[str]]] = None, + stream: Optional[bool] = None, + stream_options: Optional[Dict[str, Any]] = None, + temperature: Optional[float] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + tools: 
Optional[List[Dict[str, Any]]] = None, + top_logprobs: Optional[int] = None, + top_p: Optional[float] = None, + user: Optional[str] = None, + ) -> OpenAIChatCompletion: + logger.debug( + f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}", + ) + model_obj = await self.routing_table.get_model(model) + if model_obj is None: + raise ValueError(f"Model '{model}' not found") + if model_obj.model_type == ModelType.embedding: + raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions") + + params = dict( + model=model_obj.identifier, + messages=messages, + frequency_penalty=frequency_penalty, + function_call=function_call, + functions=functions, + logit_bias=logit_bias, + logprobs=logprobs, + max_completion_tokens=max_completion_tokens, + max_tokens=max_tokens, + n=n, + parallel_tool_calls=parallel_tool_calls, + presence_penalty=presence_penalty, + response_format=response_format, + seed=seed, + stop=stop, + stream=stream, + stream_options=stream_options, + temperature=temperature, + tool_choice=tool_choice, + tools=tools, + top_logprobs=top_logprobs, + top_p=top_p, + user=user, + ) + + provider = self.routing_table.get_provider_impl(model_obj.identifier) + return await provider.openai_chat_completion(**params) + class SafetyRouter(Safety): def __init__( diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index f6adae49d..18b0c891f 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -5,6 +5,7 @@ # the root directory of this source tree. 
import logging +import time import uuid from typing import Any, Dict, List, Optional @@ -23,7 +24,7 @@ from llama_stack.apis.datasets import ( RowsDataSource, URIDataSource, ) -from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType +from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType, OpenAIListModelsResponse, OpenAIModel from llama_stack.apis.resource import ResourceType from llama_stack.apis.scoring_functions import ( ListScoringFunctionsResponse, @@ -254,6 +255,19 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models): async def list_models(self) -> ListModelsResponse: return ListModelsResponse(data=await self.get_all_with_type("model")) + async def openai_list_models(self) -> OpenAIListModelsResponse: + models = await self.get_all_with_type("model") + openai_models = [ + OpenAIModel( + id=model.identifier, + object="model", + created=int(time.time()), + owned_by="llama_stack", + ) + for model in models + ] + return OpenAIListModelsResponse(data=openai_models) + async def get_model(self, model_id: str) -> Model: model = await self.get_object_by_identifier("model", model_id) if model is None: diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py index 5f81d6421..3a7632065 100644 --- a/llama_stack/providers/inline/inference/meta_reference/inference.py +++ b/llama_stack/providers/inline/inference/meta_reference/inference.py @@ -54,6 +54,10 @@ from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, build_hf_repo_model_entry, ) +from llama_stack.providers.utils.inference.openai_compat import ( + OpenAIChatCompletionUnsupportedMixin, + OpenAICompletionUnsupportedMixin, +) from llama_stack.providers.utils.inference.prompt_adapter import ( augment_content_with_response_format_prompt, chat_completion_request_to_messages, @@ -79,6 +83,8 @@ def llama4_builder_fn(config: 
MetaReferenceInferenceConfig, model_id: str, llama class MetaReferenceInferenceImpl( + OpenAICompletionUnsupportedMixin, + OpenAIChatCompletionUnsupportedMixin, SentenceTransformerEmbeddingMixin, Inference, ModelsProtocolPrivate, diff --git a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py index 39847e085..9c370b6c5 100644 --- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py @@ -23,6 +23,10 @@ from llama_stack.providers.datatypes import Model, ModelsProtocolPrivate from llama_stack.providers.utils.inference.embedding_mixin import ( SentenceTransformerEmbeddingMixin, ) +from llama_stack.providers.utils.inference.openai_compat import ( + OpenAIChatCompletionUnsupportedMixin, + OpenAICompletionUnsupportedMixin, +) from .config import SentenceTransformersInferenceConfig @@ -30,6 +34,8 @@ log = logging.getLogger(__name__) class SentenceTransformersInferenceImpl( + OpenAIChatCompletionUnsupportedMixin, + OpenAICompletionUnsupportedMixin, SentenceTransformerEmbeddingMixin, Inference, ModelsProtocolPrivate, diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py index ea2643b7a..085c79d6b 100644 --- a/llama_stack/providers/inline/inference/vllm/vllm.py +++ b/llama_stack/providers/inline/inference/vllm/vllm.py @@ -66,8 +66,10 @@ from llama_stack.providers.utils.inference.model_registry import ( ModelsProtocolPrivate, ) from llama_stack.providers.utils.inference.openai_compat import ( + OpenAIChatCompletionUnsupportedMixin, OpenAICompatCompletionChoice, OpenAICompatCompletionResponse, + OpenAICompletionUnsupportedMixin, get_stop_reason, process_chat_completion_stream_response, ) @@ -172,7 +174,12 @@ def _convert_sampling_params( return vllm_sampling_params 
-class VLLMInferenceImpl(Inference, ModelsProtocolPrivate): +class VLLMInferenceImpl( + Inference, + OpenAIChatCompletionUnsupportedMixin, + OpenAICompletionUnsupportedMixin, + ModelsProtocolPrivate, +): """ vLLM-based inference model adapter for Llama Stack with support for multiple models. diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py index 120da5bd4..0a485da8f 100644 --- a/llama_stack/providers/remote/inference/bedrock/bedrock.py +++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py @@ -36,8 +36,10 @@ from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, ) from llama_stack.providers.utils.inference.openai_compat import ( + OpenAIChatCompletionUnsupportedMixin, OpenAICompatCompletionChoice, OpenAICompatCompletionResponse, + OpenAICompletionUnsupportedMixin, get_sampling_strategy_options, process_chat_completion_response, process_chat_completion_stream_response, @@ -51,7 +53,12 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( from .models import MODEL_ENTRIES -class BedrockInferenceAdapter(ModelRegistryHelper, Inference): +class BedrockInferenceAdapter( + ModelRegistryHelper, + Inference, + OpenAIChatCompletionUnsupportedMixin, + OpenAICompletionUnsupportedMixin, +): def __init__(self, config: BedrockConfig) -> None: ModelRegistryHelper.__init__(self, MODEL_ENTRIES) self._config = config diff --git a/llama_stack/providers/remote/inference/cerebras/cerebras.py b/llama_stack/providers/remote/inference/cerebras/cerebras.py index 43d986b86..5e0a5b484 100644 --- a/llama_stack/providers/remote/inference/cerebras/cerebras.py +++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py @@ -34,6 +34,8 @@ from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, ) from llama_stack.providers.utils.inference.openai_compat import ( + OpenAIChatCompletionUnsupportedMixin, + 
OpenAICompletionUnsupportedMixin, get_sampling_options, process_chat_completion_response, process_chat_completion_stream_response, @@ -49,7 +51,12 @@ from .config import CerebrasImplConfig from .models import MODEL_ENTRIES -class CerebrasInferenceAdapter(ModelRegistryHelper, Inference): +class CerebrasInferenceAdapter( + ModelRegistryHelper, + Inference, + OpenAIChatCompletionUnsupportedMixin, + OpenAICompletionUnsupportedMixin, +): def __init__(self, config: CerebrasImplConfig) -> None: ModelRegistryHelper.__init__( self, diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py index 0eaf0135b..a10878b27 100644 --- a/llama_stack/providers/remote/inference/databricks/databricks.py +++ b/llama_stack/providers/remote/inference/databricks/databricks.py @@ -34,6 +34,8 @@ from llama_stack.providers.utils.inference.model_registry import ( build_hf_repo_model_entry, ) from llama_stack.providers.utils.inference.openai_compat import ( + OpenAIChatCompletionUnsupportedMixin, + OpenAICompletionUnsupportedMixin, get_sampling_options, process_chat_completion_response, process_chat_completion_stream_response, @@ -56,7 +58,12 @@ model_entries = [ ] -class DatabricksInferenceAdapter(ModelRegistryHelper, Inference): +class DatabricksInferenceAdapter( + ModelRegistryHelper, + Inference, + OpenAIChatCompletionUnsupportedMixin, + OpenAICompletionUnsupportedMixin, +): def __init__(self, config: DatabricksImplConfig) -> None: ModelRegistryHelper.__init__(self, model_entries=model_entries) self.config = config diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index 4acbe43f8..b59e9f2cb 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -4,9 +4,10 @@ # This source code is licensed under the terms described in the LICENSE 
file in # the root directory of this source tree. -from typing import AsyncGenerator, List, Optional, Union +from typing import Any, AsyncGenerator, Dict, List, Optional, Union from fireworks.client import Fireworks +from openai import AsyncOpenAI from llama_stack.apis.common.content_types import ( InterleavedContent, @@ -31,6 +32,7 @@ from llama_stack.apis.inference import ( ToolDefinition, ToolPromptFormat, ) +from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam from llama_stack.distribution.request_headers import NeedsRequestProviderData from llama_stack.log import get_logger from llama_stack.providers.utils.inference.model_registry import ( @@ -39,6 +41,7 @@ from llama_stack.providers.utils.inference.model_registry import ( from llama_stack.providers.utils.inference.openai_compat import ( convert_message_to_openai_dict, get_sampling_options, + prepare_openai_completion_params, process_chat_completion_response, process_chat_completion_stream_response, process_completion_response, @@ -81,10 +84,16 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv ) return provider_data.fireworks_api_key + def _get_base_url(self) -> str: + return "https://api.fireworks.ai/inference/v1" + def _get_client(self) -> Fireworks: fireworks_api_key = self._get_api_key() return Fireworks(api_key=fireworks_api_key) + def _get_openai_client(self) -> AsyncOpenAI: + return AsyncOpenAI(base_url=self._get_base_url(), api_key=self._get_api_key()) + async def completion( self, model_id: str, @@ -268,3 +277,101 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv embeddings = [data.embedding for data in response.data] return EmbeddingsResponse(embeddings=embeddings) + + async def openai_completion( + self, + model: str, + prompt: Union[str, List[str], List[int], List[List[int]]], + best_of: Optional[int] = None, + echo: Optional[bool] = None, + frequency_penalty: Optional[float] = 
None, + logit_bias: Optional[Dict[str, float]] = None, + logprobs: Optional[bool] = None, + max_tokens: Optional[int] = None, + n: Optional[int] = None, + presence_penalty: Optional[float] = None, + seed: Optional[int] = None, + stop: Optional[Union[str, List[str]]] = None, + stream: Optional[bool] = None, + stream_options: Optional[Dict[str, Any]] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + user: Optional[str] = None, + guided_choice: Optional[List[str]] = None, + prompt_logprobs: Optional[int] = None, + ) -> OpenAICompletion: + model_obj = await self.model_store.get_model(model) + params = await prepare_openai_completion_params( + model=model_obj.provider_resource_id, + prompt=prompt, + best_of=best_of, + echo=echo, + frequency_penalty=frequency_penalty, + logit_bias=logit_bias, + logprobs=logprobs, + max_tokens=max_tokens, + n=n, + presence_penalty=presence_penalty, + seed=seed, + stop=stop, + stream=stream, + stream_options=stream_options, + temperature=temperature, + top_p=top_p, + user=user, + ) + return await self._get_openai_client().completions.create(**params) + + async def openai_chat_completion( + self, + model: str, + messages: List[OpenAIMessageParam], + frequency_penalty: Optional[float] = None, + function_call: Optional[Union[str, Dict[str, Any]]] = None, + functions: Optional[List[Dict[str, Any]]] = None, + logit_bias: Optional[Dict[str, float]] = None, + logprobs: Optional[bool] = None, + max_completion_tokens: Optional[int] = None, + max_tokens: Optional[int] = None, + n: Optional[int] = None, + parallel_tool_calls: Optional[bool] = None, + presence_penalty: Optional[float] = None, + response_format: Optional[Dict[str, str]] = None, + seed: Optional[int] = None, + stop: Optional[Union[str, List[str]]] = None, + stream: Optional[bool] = None, + stream_options: Optional[Dict[str, Any]] = None, + temperature: Optional[float] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + tools: 
Optional[List[Dict[str, Any]]] = None, + top_logprobs: Optional[int] = None, + top_p: Optional[float] = None, + user: Optional[str] = None, + ) -> OpenAIChatCompletion: + model_obj = await self.model_store.get_model(model) + params = await prepare_openai_completion_params( + model=model_obj.provider_resource_id, + messages=messages, + frequency_penalty=frequency_penalty, + function_call=function_call, + functions=functions, + logit_bias=logit_bias, + logprobs=logprobs, + max_completion_tokens=max_completion_tokens, + max_tokens=max_tokens, + n=n, + parallel_tool_calls=parallel_tool_calls, + presence_penalty=presence_penalty, + response_format=response_format, + seed=seed, + stop=stop, + stream=stream, + stream_options=stream_options, + temperature=temperature, + tool_choice=tool_choice, + tools=tools, + top_logprobs=top_logprobs, + top_p=top_p, + user=user, + ) + return await self._get_openai_client().chat.completions.create(**params) diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py index e1f5d7a6a..d6f717719 100644 --- a/llama_stack/providers/remote/inference/nvidia/nvidia.py +++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py @@ -7,7 +7,7 @@ import logging import warnings from functools import lru_cache -from typing import AsyncIterator, List, Optional, Union +from typing import Any, AsyncIterator, Dict, List, Optional, Union from openai import APIConnectionError, AsyncOpenAI, BadRequestError @@ -35,6 +35,7 @@ from llama_stack.apis.inference import ( ToolConfig, ToolDefinition, ) +from llama_stack.apis.inference.inference import OpenAIChatCompletion, OpenAICompletion, OpenAIMessageParam from llama_stack.models.llama.datatypes import ToolPromptFormat from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, @@ -42,6 +43,7 @@ from llama_stack.providers.utils.inference.model_registry import ( from llama_stack.providers.utils.inference.openai_compat 
async def openai_completion(
    self,
    model: str,
    prompt: Union[str, List[str], List[int], List[List[int]]],
    best_of: Optional[int] = None,
    echo: Optional[bool] = None,
    frequency_penalty: Optional[float] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    logprobs: Optional[bool] = None,
    max_tokens: Optional[int] = None,
    n: Optional[int] = None,
    presence_penalty: Optional[float] = None,
    seed: Optional[int] = None,
    stop: Optional[Union[str, List[str]]] = None,
    stream: Optional[bool] = None,
    stream_options: Optional[Dict[str, Any]] = None,
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
    user: Optional[str] = None,
    guided_choice: Optional[List[str]] = None,
    prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
    """Send an OpenAI-style text completion request to NVIDIA NIM.

    ``model`` is translated with ``self.get_provider_model_id``. Note that
    ``guided_choice`` and ``prompt_logprobs`` are accepted for interface
    compatibility but are not forwarded to the backend.

    Raises:
        ConnectionError: if the OpenAI SDK reports an ``APIConnectionError``.
    """
    nim_model_id = self.get_provider_model_id(model)

    request_kwargs = await prepare_openai_completion_params(
        model=nim_model_id,
        prompt=prompt,
        best_of=best_of,
        echo=echo,
        frequency_penalty=frequency_penalty,
        logit_bias=logit_bias,
        logprobs=logprobs,
        max_tokens=max_tokens,
        n=n,
        presence_penalty=presence_penalty,
        seed=seed,
        stop=stop,
        stream=stream,
        stream_options=stream_options,
        temperature=temperature,
        top_p=top_p,
        user=user,
    )

    try:
        nim_client = self._get_client(nim_model_id)
        return await nim_client.completions.create(**request_kwargs)
    except APIConnectionError as e:
        raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e


async def openai_chat_completion(
    self,
    model: str,
    messages: List[OpenAIMessageParam],
    frequency_penalty: Optional[float] = None,
    function_call: Optional[Union[str, Dict[str, Any]]] = None,
    functions: Optional[List[Dict[str, Any]]] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    logprobs: Optional[bool] = None,
    max_completion_tokens: Optional[int] = None,
    max_tokens: Optional[int] = None,
    n: Optional[int] = None,
    parallel_tool_calls: Optional[bool] = None,
    presence_penalty: Optional[float] = None,
    response_format: Optional[Dict[str, str]] = None,
    seed: Optional[int] = None,
    stop: Optional[Union[str, List[str]]] = None,
    stream: Optional[bool] = None,
    stream_options: Optional[Dict[str, Any]] = None,
    temperature: Optional[float] = None,
    tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
    tools: Optional[List[Dict[str, Any]]] = None,
    top_logprobs: Optional[int] = None,
    top_p: Optional[float] = None,
    user: Optional[str] = None,
) -> OpenAIChatCompletion:
    """Send an OpenAI-style chat completion request to NVIDIA NIM.

    ``model`` is translated with ``self.get_provider_model_id`` and every
    other argument is passed through ``prepare_openai_completion_params``
    before being sent via ``chat.completions.create``.

    Raises:
        ConnectionError: if the OpenAI SDK reports an ``APIConnectionError``.
    """
    nim_model_id = self.get_provider_model_id(model)

    request_kwargs = await prepare_openai_completion_params(
        model=nim_model_id,
        messages=messages,
        frequency_penalty=frequency_penalty,
        function_call=function_call,
        functions=functions,
        logit_bias=logit_bias,
        logprobs=logprobs,
        max_completion_tokens=max_completion_tokens,
        max_tokens=max_tokens,
        n=n,
        parallel_tool_calls=parallel_tool_calls,
        presence_penalty=presence_penalty,
        response_format=response_format,
        seed=seed,
        stop=stop,
        stream=stream,
        stream_options=stream_options,
        temperature=temperature,
        tool_choice=tool_choice,
        tools=tools,
        top_logprobs=top_logprobs,
        top_p=top_p,
        user=user,
    )

    try:
        nim_client = self._get_client(nim_model_id)
        return await nim_client.chat.completions.create(**request_kwargs)
    except APIConnectionError as e:
        raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e
@property
def openai_client(self) -> AsyncOpenAI:
    """Return an OpenAI SDK client aimed at this server's ``/v1`` path.

    A fresh ``AsyncOpenAI`` instance is constructed on every access.
    """
    # NOTE(review): "ollama" looks like a placeholder API key - confirm.
    v1_endpoint = f"{self.url}/v1"
    return AsyncOpenAI(base_url=v1_endpoint, api_key="ollama")


async def openai_completion(
    self,
    model: str,
    prompt: Union[str, List[str], List[int], List[List[int]]],
    best_of: Optional[int] = None,
    echo: Optional[bool] = None,
    frequency_penalty: Optional[float] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    logprobs: Optional[bool] = None,
    max_tokens: Optional[int] = None,
    n: Optional[int] = None,
    presence_penalty: Optional[float] = None,
    seed: Optional[int] = None,
    stop: Optional[Union[str, List[str]]] = None,
    stream: Optional[bool] = None,
    stream_options: Optional[Dict[str, Any]] = None,
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
    user: Optional[str] = None,
    guided_choice: Optional[List[str]] = None,
    prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
    """Run an OpenAI-style text completion through ``self.openai_client``.

    Only a single string prompt is accepted. ``guided_choice`` and
    ``prompt_logprobs`` are accepted but not forwarded. Arguments left as
    ``None`` are omitted from the outgoing request.

    Raises:
        ValueError: if ``prompt`` is not a ``str``.
    """
    if not isinstance(prompt, str):
        raise ValueError("Ollama does not support non-string prompts for completion")

    resolved = await self._get_model(model)
    candidate_args = {
        "model": resolved.provider_resource_id,
        "prompt": prompt,
        "best_of": best_of,
        "echo": echo,
        "frequency_penalty": frequency_penalty,
        "logit_bias": logit_bias,
        "logprobs": logprobs,
        "max_tokens": max_tokens,
        "n": n,
        "presence_penalty": presence_penalty,
        "seed": seed,
        "stop": stop,
        "stream": stream,
        "stream_options": stream_options,
        "temperature": temperature,
        "top_p": top_p,
        "user": user,
    }
    # Keep only the arguments the caller actually supplied.
    request_kwargs = {}
    for arg_name, arg_value in candidate_args.items():
        if arg_value is not None:
            request_kwargs[arg_name] = arg_value
    return await self.openai_client.completions.create(**request_kwargs)  # type: ignore


async def openai_chat_completion(
    self,
    model: str,
    messages: List[OpenAIMessageParam],
    frequency_penalty: Optional[float] = None,
    function_call: Optional[Union[str, Dict[str, Any]]] = None,
    functions: Optional[List[Dict[str, Any]]] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    logprobs: Optional[bool] = None,
    max_completion_tokens: Optional[int] = None,
    max_tokens: Optional[int] = None,
    n: Optional[int] = None,
    parallel_tool_calls: Optional[bool] = None,
    presence_penalty: Optional[float] = None,
    response_format: Optional[Dict[str, str]] = None,
    seed: Optional[int] = None,
    stop: Optional[Union[str, List[str]]] = None,
    stream: Optional[bool] = None,
    stream_options: Optional[Dict[str, Any]] = None,
    temperature: Optional[float] = None,
    tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
    tools: Optional[List[Dict[str, Any]]] = None,
    top_logprobs: Optional[int] = None,
    top_p: Optional[float] = None,
    user: Optional[str] = None,
) -> OpenAIChatCompletion:
    """Run an OpenAI-style chat completion through ``self.openai_client``.

    The model is resolved with ``self._get_model``; arguments left as
    ``None`` are omitted from the outgoing request.
    """
    resolved = await self._get_model(model)
    candidate_args = {
        "model": resolved.provider_resource_id,
        "messages": messages,
        "frequency_penalty": frequency_penalty,
        "function_call": function_call,
        "functions": functions,
        "logit_bias": logit_bias,
        "logprobs": logprobs,
        "max_completion_tokens": max_completion_tokens,
        "max_tokens": max_tokens,
        "n": n,
        "parallel_tool_calls": parallel_tool_calls,
        "presence_penalty": presence_penalty,
        "response_format": response_format,
        "seed": seed,
        "stop": stop,
        "stream": stream,
        "stream_options": stream_options,
        "temperature": temperature,
        "tool_choice": tool_choice,
        "tools": tools,
        "top_logprobs": top_logprobs,
        "top_p": top_p,
        "user": user,
    }
    # Keep only the arguments the caller actually supplied.
    request_kwargs = {}
    for arg_name, arg_value in candidate_args.items():
        if arg_value is not None:
            request_kwargs[arg_name] = arg_value
    return await self.openai_client.chat.completions.create(**request_kwargs)  # type: ignore
async def openai_completion(
    self,
    model: str,
    prompt: Union[str, List[str], List[int], List[List[int]]],
    best_of: Optional[int] = None,
    echo: Optional[bool] = None,
    frequency_penalty: Optional[float] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    logprobs: Optional[bool] = None,
    max_tokens: Optional[int] = None,
    n: Optional[int] = None,
    presence_penalty: Optional[float] = None,
    seed: Optional[int] = None,
    stop: Optional[Union[str, List[str]]] = None,
    stream: Optional[bool] = None,
    stream_options: Optional[Dict[str, Any]] = None,
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
    user: Optional[str] = None,
    guided_choice: Optional[List[str]] = None,
    prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
    """Relay an OpenAI-style text completion to the downstream stack.

    The downstream client comes from ``self._get_client()`` and the model is
    resolved through ``self.model_store``. Unlike most sibling adapters,
    ``guided_choice`` and ``prompt_logprobs`` are forwarded as well.
    """
    downstream = self._get_client()
    registered_model = await self.model_store.get_model(model)

    forwarded = {
        "model": registered_model.provider_resource_id,
        "prompt": prompt,
        "best_of": best_of,
        "echo": echo,
        "frequency_penalty": frequency_penalty,
        "logit_bias": logit_bias,
        "logprobs": logprobs,
        "max_tokens": max_tokens,
        "n": n,
        "presence_penalty": presence_penalty,
        "seed": seed,
        "stop": stop,
        "stream": stream,
        "stream_options": stream_options,
        "temperature": temperature,
        "top_p": top_p,
        "user": user,
        "guided_choice": guided_choice,
        "prompt_logprobs": prompt_logprobs,
    }
    request_kwargs = await prepare_openai_completion_params(**forwarded)

    return await downstream.inference.openai_completion(**request_kwargs)


async def openai_chat_completion(
    self,
    model: str,
    messages: List[OpenAIMessageParam],
    frequency_penalty: Optional[float] = None,
    function_call: Optional[Union[str, Dict[str, Any]]] = None,
    functions: Optional[List[Dict[str, Any]]] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    logprobs: Optional[bool] = None,
    max_completion_tokens: Optional[int] = None,
    max_tokens: Optional[int] = None,
    n: Optional[int] = None,
    parallel_tool_calls: Optional[bool] = None,
    presence_penalty: Optional[float] = None,
    response_format: Optional[Dict[str, str]] = None,
    seed: Optional[int] = None,
    stop: Optional[Union[str, List[str]]] = None,
    stream: Optional[bool] = None,
    stream_options: Optional[Dict[str, Any]] = None,
    temperature: Optional[float] = None,
    tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
    tools: Optional[List[Dict[str, Any]]] = None,
    top_logprobs: Optional[int] = None,
    top_p: Optional[float] = None,
    user: Optional[str] = None,
) -> OpenAIChatCompletion:
    """Relay an OpenAI-style chat completion to the downstream stack.

    The downstream client comes from ``self._get_client()`` and the model is
    resolved through ``self.model_store`` before the request is sent via
    ``client.inference.openai_chat_completion``.
    """
    downstream = self._get_client()
    registered_model = await self.model_store.get_model(model)

    forwarded = {
        "model": registered_model.provider_resource_id,
        "messages": messages,
        "frequency_penalty": frequency_penalty,
        "function_call": function_call,
        "functions": functions,
        "logit_bias": logit_bias,
        "logprobs": logprobs,
        "max_completion_tokens": max_completion_tokens,
        "max_tokens": max_tokens,
        "n": n,
        "parallel_tool_calls": parallel_tool_calls,
        "presence_penalty": presence_penalty,
        "response_format": response_format,
        "seed": seed,
        "stop": stop,
        "stream": stream,
        "stream_options": stream_options,
        "temperature": temperature,
        "tool_choice": tool_choice,
        "tools": tools,
        "top_logprobs": top_logprobs,
        "top_p": top_p,
        "user": user,
    }
    request_kwargs = await prepare_openai_completion_params(**forwarded)

    return await downstream.inference.openai_chat_completion(**request_kwargs)
OpenAIChatCompletionUnsupportedMixin, + OpenAICompletionUnsupportedMixin, process_chat_completion_stream_response, ) from llama_stack.providers.utils.inference.prompt_adapter import ( @@ -52,7 +54,12 @@ from .config import SambaNovaImplConfig from .models import MODEL_ENTRIES -class SambaNovaInferenceAdapter(ModelRegistryHelper, Inference): +class SambaNovaInferenceAdapter( + ModelRegistryHelper, + Inference, + OpenAIChatCompletionUnsupportedMixin, + OpenAICompletionUnsupportedMixin, +): def __init__(self, config: SambaNovaImplConfig) -> None: ModelRegistryHelper.__init__(self, model_entries=MODEL_ENTRIES) self.config = config diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py index fe99fafe1..8f5b5e3cc 100644 --- a/llama_stack/providers/remote/inference/tgi/tgi.py +++ b/llama_stack/providers/remote/inference/tgi/tgi.py @@ -40,8 +40,10 @@ from llama_stack.providers.utils.inference.model_registry import ( build_hf_repo_model_entry, ) from llama_stack.providers.utils.inference.openai_compat import ( + OpenAIChatCompletionUnsupportedMixin, OpenAICompatCompletionChoice, OpenAICompatCompletionResponse, + OpenAICompletionUnsupportedMixin, get_sampling_options, process_chat_completion_response, process_chat_completion_stream_response, @@ -69,7 +71,12 @@ def build_hf_repo_model_entries(): ] -class _HfAdapter(Inference, ModelsProtocolPrivate): +class _HfAdapter( + Inference, + OpenAIChatCompletionUnsupportedMixin, + OpenAICompletionUnsupportedMixin, + ModelsProtocolPrivate, +): client: AsyncInferenceClient max_tokens: int model_id: str diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py index df7610935..1615b8cd1 100644 --- a/llama_stack/providers/remote/inference/together/together.py +++ b/llama_stack/providers/remote/inference/together/together.py @@ -4,8 +4,9 @@ # This source code is licensed under the terms described in the 
def _get_openai_client(self) -> AsyncOpenAI:
    """Return the cached ``AsyncOpenAI`` client, creating it on first use.

    The client reuses the ``base_url`` and ``api_key`` of the inner client
    exposed by ``self._get_client().client``.
    """
    cached = self._openai_client
    if cached:
        return cached

    inner_client = self._get_client().client
    self._openai_client = AsyncOpenAI(
        base_url=inner_client.base_url,
        api_key=inner_client.api_key,
    )
    return self._openai_client
async def openai_completion(
    self,
    model: str,
    prompt: Union[str, List[str], List[int], List[List[int]]],
    best_of: Optional[int] = None,
    echo: Optional[bool] = None,
    frequency_penalty: Optional[float] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    logprobs: Optional[bool] = None,
    max_tokens: Optional[int] = None,
    n: Optional[int] = None,
    presence_penalty: Optional[float] = None,
    seed: Optional[int] = None,
    stop: Optional[Union[str, List[str]]] = None,
    stream: Optional[bool] = None,
    stream_options: Optional[Dict[str, Any]] = None,
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
    user: Optional[str] = None,
    guided_choice: Optional[List[str]] = None,
    prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
    """Send an OpenAI-style text completion through the OpenAI SDK client.

    The model is resolved via ``self.model_store``. ``guided_choice`` and
    ``prompt_logprobs`` are accepted but not forwarded.
    """
    registered_model = await self.model_store.get_model(model)
    request_kwargs = await prepare_openai_completion_params(
        model=registered_model.provider_resource_id,
        prompt=prompt,
        best_of=best_of,
        echo=echo,
        frequency_penalty=frequency_penalty,
        logit_bias=logit_bias,
        logprobs=logprobs,
        max_tokens=max_tokens,
        n=n,
        presence_penalty=presence_penalty,
        seed=seed,
        stop=stop,
        stream=stream,
        stream_options=stream_options,
        temperature=temperature,
        top_p=top_p,
        user=user,
    )
    sdk_client = self._get_openai_client()
    return await sdk_client.completions.create(**request_kwargs)  # type: ignore


async def openai_chat_completion(
    self,
    model: str,
    messages: List[OpenAIMessageParam],
    frequency_penalty: Optional[float] = None,
    function_call: Optional[Union[str, Dict[str, Any]]] = None,
    functions: Optional[List[Dict[str, Any]]] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    logprobs: Optional[bool] = None,
    max_completion_tokens: Optional[int] = None,
    max_tokens: Optional[int] = None,
    n: Optional[int] = None,
    parallel_tool_calls: Optional[bool] = None,
    presence_penalty: Optional[float] = None,
    response_format: Optional[Dict[str, str]] = None,
    seed: Optional[int] = None,
    stop: Optional[Union[str, List[str]]] = None,
    stream: Optional[bool] = None,
    stream_options: Optional[Dict[str, Any]] = None,
    temperature: Optional[float] = None,
    tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
    tools: Optional[List[Dict[str, Any]]] = None,
    top_logprobs: Optional[int] = None,
    top_p: Optional[float] = None,
    user: Optional[str] = None,
) -> OpenAIChatCompletion:
    """Send an OpenAI-style chat completion through the OpenAI SDK client.

    The model is resolved via ``self.model_store`` and the request goes out
    through ``chat.completions.create`` on ``self._get_openai_client()``.
    """
    registered_model = await self.model_store.get_model(model)
    request_kwargs = await prepare_openai_completion_params(
        model=registered_model.provider_resource_id,
        messages=messages,
        frequency_penalty=frequency_penalty,
        function_call=function_call,
        functions=functions,
        logit_bias=logit_bias,
        logprobs=logprobs,
        max_completion_tokens=max_completion_tokens,
        max_tokens=max_tokens,
        n=n,
        parallel_tool_calls=parallel_tool_calls,
        presence_penalty=presence_penalty,
        response_format=response_format,
        seed=seed,
        stop=stop,
        stream=stream,
        stream_options=stream_options,
        temperature=temperature,
        tool_choice=tool_choice,
        tools=tools,
        top_logprobs=top_logprobs,
        top_p=top_p,
        user=user,
    )
    sdk_client = self._get_openai_client()
    return await sdk_client.chat.completions.create(**request_kwargs)  # type: ignore
async def openai_completion(
    self,
    model: str,
    prompt: Union[str, List[str], List[int], List[List[int]]],
    best_of: Optional[int] = None,
    echo: Optional[bool] = None,
    frequency_penalty: Optional[float] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    logprobs: Optional[bool] = None,
    max_tokens: Optional[int] = None,
    n: Optional[int] = None,
    presence_penalty: Optional[float] = None,
    seed: Optional[int] = None,
    stop: Optional[Union[str, List[str]]] = None,
    stream: Optional[bool] = None,
    stream_options: Optional[Dict[str, Any]] = None,
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
    user: Optional[str] = None,
    guided_choice: Optional[List[str]] = None,
    prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
    """Send an OpenAI-style text completion to the vLLM server.

    ``prompt_logprobs`` (when non-negative) and ``guided_choice`` (when
    non-empty) are not regular keyword arguments of the request; they are
    placed in an ``extra_body`` dict that is passed alongside the standard
    parameters. ``extra_body`` is always sent, even when it ends up empty.
    """
    resolved = await self._get_model(model)

    vllm_extras: Dict[str, Any] = {}
    wants_prompt_logprobs = prompt_logprobs is not None and prompt_logprobs >= 0
    if wants_prompt_logprobs:
        vllm_extras["prompt_logprobs"] = prompt_logprobs
    if guided_choice:
        vllm_extras["guided_choice"] = guided_choice

    request_kwargs = await prepare_openai_completion_params(
        model=resolved.provider_resource_id,
        prompt=prompt,
        best_of=best_of,
        echo=echo,
        frequency_penalty=frequency_penalty,
        logit_bias=logit_bias,
        logprobs=logprobs,
        max_tokens=max_tokens,
        n=n,
        presence_penalty=presence_penalty,
        seed=seed,
        stop=stop,
        stream=stream,
        stream_options=stream_options,
        temperature=temperature,
        top_p=top_p,
        user=user,
        extra_body=vllm_extras,
    )
    completions_endpoint = self.client.completions
    return await completions_endpoint.create(**request_kwargs)  # type: ignore
async def openai_completion(
    self,
    model: str,
    prompt: Union[str, List[str], List[int], List[List[int]]],
    best_of: Optional[int] = None,
    echo: Optional[bool] = None,
    frequency_penalty: Optional[float] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    logprobs: Optional[bool] = None,
    max_tokens: Optional[int] = None,
    n: Optional[int] = None,
    presence_penalty: Optional[float] = None,
    seed: Optional[int] = None,
    stop: Optional[Union[str, List[str]]] = None,
    stream: Optional[bool] = None,
    stream_options: Optional[Dict[str, Any]] = None,
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
    user: Optional[str] = None,
    guided_choice: Optional[List[str]] = None,
    prompt_logprobs: Optional[int] = None,
) -> OpenAICompletion:
    """Run an OpenAI-style text completion through LiteLLM.

    The model is resolved with ``self._get_model`` and its
    ``provider_resource_id`` is used as the LiteLLM model name. Arguments
    left as ``None`` are dropped by ``prepare_openai_completion_params``.

    Returns:
        The response produced by ``litellm.atext_completion``.
    """
    model_obj = await self._get_model(model)
    params = await prepare_openai_completion_params(
        model=model_obj.provider_resource_id,
        prompt=prompt,
        best_of=best_of,
        echo=echo,
        frequency_penalty=frequency_penalty,
        logit_bias=logit_bias,
        logprobs=logprobs,
        max_tokens=max_tokens,
        n=n,
        presence_penalty=presence_penalty,
        seed=seed,
        stop=stop,
        stream=stream,
        stream_options=stream_options,
        temperature=temperature,
        top_p=top_p,
        user=user,
        guided_choice=guided_choice,
        prompt_logprobs=prompt_logprobs,
    )
    # Use the async entry point: the synchronous litellm.text_completion
    # would block the event loop for the whole network round trip.
    return await litellm.atext_completion(**params)


async def openai_chat_completion(
    self,
    model: str,
    messages: List[OpenAIMessageParam],
    frequency_penalty: Optional[float] = None,
    function_call: Optional[Union[str, Dict[str, Any]]] = None,
    functions: Optional[List[Dict[str, Any]]] = None,
    logit_bias: Optional[Dict[str, float]] = None,
    logprobs: Optional[bool] = None,
    max_completion_tokens: Optional[int] = None,
    max_tokens: Optional[int] = None,
    n: Optional[int] = None,
    parallel_tool_calls: Optional[bool] = None,
    presence_penalty: Optional[float] = None,
    response_format: Optional[Dict[str, str]] = None,
    seed: Optional[int] = None,
    stop: Optional[Union[str, List[str]]] = None,
    stream: Optional[bool] = None,
    stream_options: Optional[Dict[str, Any]] = None,
    temperature: Optional[float] = None,
    tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
    tools: Optional[List[Dict[str, Any]]] = None,
    top_logprobs: Optional[int] = None,
    top_p: Optional[float] = None,
    user: Optional[str] = None,
) -> OpenAIChatCompletion:
    """Run an OpenAI-style chat completion through LiteLLM.

    The model is resolved with ``self._get_model`` and its
    ``provider_resource_id`` is used as the LiteLLM model name. Arguments
    left as ``None`` are dropped by ``prepare_openai_completion_params``.

    Returns:
        The response produced by ``litellm.acompletion``.
    """
    model_obj = await self._get_model(model)
    params = await prepare_openai_completion_params(
        model=model_obj.provider_resource_id,
        messages=messages,
        frequency_penalty=frequency_penalty,
        function_call=function_call,
        functions=functions,
        logit_bias=logit_bias,
        logprobs=logprobs,
        max_completion_tokens=max_completion_tokens,
        max_tokens=max_tokens,
        n=n,
        parallel_tool_calls=parallel_tool_calls,
        presence_penalty=presence_penalty,
        response_format=response_format,
        seed=seed,
        stop=stop,
        stream=stream,
        stream_options=stream_options,
        temperature=temperature,
        tool_choice=tool_choice,
        tools=tools,
        top_logprobs=top_logprobs,
        top_p=top_p,
        user=user,
    )
    # Use the async entry point: the synchronous litellm.completion would
    # block the event loop and hand back a sync stream when stream=True.
    return await litellm.acompletion(**params)
def _convert_openai_sampling_params(
    max_tokens: Optional[int] = None,
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
) -> SamplingParams:
    """Build a ``SamplingParams`` from OpenAI-style sampling arguments.

    Args:
        max_tokens: Copied onto the result only when truthy; ``None`` and ``0``
            leave the ``SamplingParams`` default untouched.
        temperature: An explicit ``0`` selects greedy sampling. ``None`` falls
            back to 1.0.
        top_p: ``None`` falls back to 1.0. Not used when greedy sampling is
            selected.

    Returns:
        A ``SamplingParams`` whose ``strategy`` is either a
        ``GreedySamplingStrategy`` or a ``TopPSamplingStrategy``.
    """
    sampling_params = SamplingParams()

    if max_tokens:
        sampling_params.max_tokens = max_tokens

    # Map an explicit temperature of 0 to greedy sampling
    if temperature == 0:
        strategy = GreedySamplingStrategy()
    else:
        # OpenAI defaults to 1.0 for temperature and top_p if unset
        if temperature is None:
            temperature = 1.0
        if top_p is None:
            top_p = 1.0
        strategy = TopPSamplingStrategy(temperature=temperature, top_p=top_p)

    sampling_params.strategy = strategy
    return sampling_params


async def prepare_openai_completion_params(**params):
    """Return ``params`` as a dict with every ``None``-valued entry removed.

    Falsy values other than ``None`` (``0``, ``False``, ``""``, ``[]``) are kept.
    """
    completion_params = {k: v for k, v in params.items() if v is not None}
    return completion_params


class OpenAICompletionUnsupportedMixin:
    """Emulates an OpenAI-style text completion on top of ``self.completion``.

    The host class must provide ``completion(model_id=..., content=...,
    sampling_params=...)`` whose result exposes ``content`` and ``stop_reason``.
    """

    async def openai_completion(
        self,
        model: str,
        prompt: Union[str, List[str], List[int], List[List[int]]],
        best_of: Optional[int] = None,
        echo: Optional[bool] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
        guided_choice: Optional[List[str]] = None,
        prompt_logprobs: Optional[int] = None,
    ) -> OpenAICompletion:
        """Generate completions by calling ``self.completion`` once per prompt and sample.

        Only ``model``, ``prompt``, ``n``, ``stream``, ``max_tokens``,
        ``temperature`` and ``top_p`` are used; every other argument is accepted
        for signature compatibility and ignored.

        Args:
            model: Passed to ``self.completion`` as ``model_id`` and echoed back
                in the response.
            prompt: A single prompt, or a list that is treated as a batch with
                one ``self.completion`` call per element.
            n: Number of completions per prompt. ``None`` means 1.
            stream: Must be falsy; streaming is not emulated.

        Returns:
            An ``OpenAICompletion`` with one choice per (sample, prompt) pair,
            indexed in generation order.

        Raises:
            ValueError: If ``stream`` is truthy.
        """
        # Function-scope import keeps this block self-contained.
        import inspect

        if stream:
            raise ValueError(f"{self.__class__.__name__} doesn't support streaming openai completions")

        # This is a fairly hacky way to emulate completions: batched prompts are
        # simply de-batched and sent one at a time.
        # NOTE(review): a List[int] prompt is therefore treated as a batch of
        # separate int prompts, not as one token-id prompt - confirm intended.
        prompts = [prompt] if not isinstance(prompt, list) else prompt

        sampling_params = _convert_openai_sampling_params(
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )

        # "n" is the number of completions to generate per prompt. It defaults to
        # None, and range(0, None) raises TypeError, so treat None as a single
        # completion. An explicit 0 still yields no choices, as before.
        completions_per_prompt = 1 if n is None else n

        choices = []
        for _ in range(completions_per_prompt):
            # and we may have multiple prompts, if batching was used
            for single_prompt in prompts:
                result = self.completion(
                    model_id=model,
                    content=single_prompt,
                    sampling_params=sampling_params,
                )
                # Presumably providers implement completion() as a coroutine;
                # await it when it is awaitable so a synchronous
                # implementation keeps working as well.
                if inspect.isawaitable(result):
                    result = await result

                choices.append(
                    OpenAICompletionChoice(
                        index=len(choices),
                        text=result.content,
                        finish_reason=_convert_openai_finish_reason(result.stop_reason),
                    )
                )

        return OpenAICompletion(
            id=f"cmpl-{uuid.uuid4()}",
            choices=choices,
            created=int(time.time()),
            model=model,
            object="text_completion",
        )


class OpenAIChatCompletionUnsupportedMixin:
    """Provides an ``openai_chat_completion`` that always reports the feature as unsupported."""

    async def openai_chat_completion(
        self,
        model: str,
        messages: List[OpenAIChatCompletionMessage],
        frequency_penalty: Optional[float] = None,
        function_call: Optional[Union[str, Dict[str, Any]]] = None,
        functions: Optional[List[Dict[str, Any]]] = None,
        logit_bias: Optional[Dict[str, float]] = None,
        logprobs: Optional[bool] = None,
        max_completion_tokens: Optional[int] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        parallel_tool_calls: Optional[bool] = None,
        presence_penalty: Optional[float] = None,
        response_format: Optional[Dict[str, str]] = None,
        seed: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stream: Optional[bool] = None,
        stream_options: Optional[Dict[str, Any]] = None,
        temperature: Optional[float] = None,
        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
        top_logprobs: Optional[int] = None,
        top_p: Optional[float] = None,
        user: Optional[str] = None,
    ) -> OpenAIChatCompletion:
        """Always raise: this mixin does not implement OpenAI chat completions.

        Raises:
            ValueError: Unconditionally, naming the concrete class.
        """
        raise ValueError(f"{self.__class__.__name__} doesn't support openai chat completion")
import pytest
from openai import OpenAI

from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

from ..test_cases.test_case import TestCase

# Provider types for which the OpenAI chat completions tests are skipped.
_NO_OPENAI_CHAT_COMPLETION_PROVIDERS = (
    "inline::meta-reference",
    "inline::sentence-transformers",
    "inline::vllm",
    "remote::bedrock",
    "remote::cerebras",
    "remote::databricks",
    "remote::runpod",
    "remote::sambanova",
    "remote::tgi",
)

# Provider types for which the OpenAI (text) completions tests are skipped.
_NO_OPENAI_COMPLETION_PROVIDERS = _NO_OPENAI_CHAT_COMPLETION_PROVIDERS + (
    # Technically Nvidia does support OpenAI completions, but none of their hosted models
    # support both completions and chat completions endpoint and all the Llama models are
    # just chat completions
    "remote::nvidia",
)


def provider_from_model(client_with_models, model_id):
    """Return the provider entry serving ``model_id``.

    ``model_id`` may be either a model's ``identifier`` or its
    ``provider_resource_id``. Raises ``KeyError`` when the model or its provider
    is not listed by the client.
    """
    # List the models once and index them under both names.
    listed_models = list(client_with_models.models.list())
    models = {m.identifier: m for m in listed_models}
    models.update({m.provider_resource_id: m for m in listed_models})
    provider_id = models[model_id].provider_id
    providers = {p.provider_id: p for p in client_with_models.providers.list()}
    return providers[provider_id]


def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id):
    """Skip the current test when OpenAI completions cannot be exercised for ``model_id``."""
    if isinstance(client_with_models, LlamaStackAsLibraryClient):
        pytest.skip("OpenAI completions are not supported when testing with library client yet.")

    provider = provider_from_model(client_with_models, model_id)
    if provider.provider_type in _NO_OPENAI_COMPLETION_PROVIDERS:
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")


def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
    """Skip the current test when OpenAI chat completions cannot be exercised for ``model_id``."""
    if isinstance(client_with_models, LlamaStackAsLibraryClient):
        pytest.skip("OpenAI chat completions are not supported when testing with library client yet.")

    provider = provider_from_model(client_with_models, model_id)
    if provider.provider_type in _NO_OPENAI_CHAT_COMPLETION_PROVIDERS:
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI chat completions.")


def skip_if_provider_isnt_vllm(client_with_models, model_id):
    """Skip the current test unless ``model_id`` is served by the ``remote::vllm`` provider."""
    provider = provider_from_model(client_with_models, model_id)
    if provider.provider_type != "remote::vllm":
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support vllm extra_body parameters.")


@pytest.fixture
def openai_client(client_with_models):
    """OpenAI SDK client pointed at the ``/v1/openai/v1`` path under the client's base URL."""
    base_url = f"{client_with_models.base_url}/v1/openai/v1"
    return OpenAI(base_url=base_url, api_key="bar")


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:completion:sanity",
    ],
)
def test_openai_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
    """A non-streaming completion returns at least one choice with non-trivial text."""
    skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
    tc = TestCase(test_case)

    # ollama needs more verbose prompting for some reason here...
    prompt = "Respond to this question and explain your answer. " + tc["content"]
    response = openai_client.completions.create(
        model=text_model_id,
        prompt=prompt,
        stream=False,
    )
    assert len(response.choices) > 0
    choice = response.choices[0]
    assert len(choice.text) > 10


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:completion:sanity",
    ],
)
def test_openai_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
    """A streaming completion yields chunks whose concatenated text is non-trivial."""
    skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
    tc = TestCase(test_case)

    # ollama needs more verbose prompting for some reason here...
    prompt = "Respond to this question and explain your answer. " + tc["content"]
    response = openai_client.completions.create(
        model=text_model_id,
        prompt=prompt,
        stream=True,
        max_tokens=50,
    )
    # Ignore chunks without choices instead of failing with IndexError.
    streamed_content = [chunk.choices[0].text for chunk in response if chunk.choices]
    content_str = "".join(streamed_content).lower().strip()
    assert len(content_str) > 10


@pytest.mark.parametrize(
    "prompt_logprobs",
    [
        1,
        0,
    ],
)
def test_openai_completion_prompt_logprobs(openai_client, client_with_models, text_model_id, prompt_logprobs):
    """The vLLM-specific ``prompt_logprobs`` extra body parameter is passed through."""
    skip_if_provider_isnt_vllm(client_with_models, text_model_id)

    prompt = "Hello, world!"
    response = openai_client.completions.create(
        model=text_model_id,
        prompt=prompt,
        stream=False,
        extra_body={
            "prompt_logprobs": prompt_logprobs,
        },
    )
    assert len(response.choices) > 0
    choice = response.choices[0]
    assert len(choice.prompt_logprobs) > 0


def test_openai_completion_guided_choice(openai_client, client_with_models, text_model_id):
    """The vLLM-specific ``guided_choice`` extra body parameter constrains the output."""
    skip_if_provider_isnt_vllm(client_with_models, text_model_id)

    prompt = "I am feeling really sad today."
    response = openai_client.completions.create(
        model=text_model_id,
        prompt=prompt,
        stream=False,
        extra_body={
            "guided_choice": ["joy", "sadness"],
        },
    )
    assert len(response.choices) > 0
    choice = response.choices[0]
    assert choice.text in ["joy", "sadness"]


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:chat_completion:non_streaming_01",
        "inference:chat_completion:non_streaming_02",
    ],
)
def test_openai_chat_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
    """A non-streaming chat completion contains the expected answer."""
    skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
    tc = TestCase(test_case)
    question = tc["question"]
    expected = tc["expected"]

    response = openai_client.chat.completions.create(
        model=text_model_id,
        messages=[
            {
                "role": "user",
                "content": question,
            }
        ],
        stream=False,
    )
    message_content = response.choices[0].message.content.lower().strip()
    assert len(message_content) > 0
    assert expected.lower() in message_content


@pytest.mark.parametrize(
    "test_case",
    [
        "inference:chat_completion:streaming_01",
        "inference:chat_completion:streaming_02",
    ],
)
def test_openai_chat_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
    """A streaming chat completion, once reassembled, contains the expected answer."""
    skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
    tc = TestCase(test_case)
    question = tc["question"]
    expected = tc["expected"]

    response = openai_client.chat.completions.create(
        model=text_model_id,
        messages=[{"role": "user", "content": question}],
        stream=True,
        timeout=120,  # Increase timeout to 2 minutes for large conversation history
    )
    streamed_content = []
    for chunk in response:
        # Ignore chunks without choices or without a content delta.
        if chunk.choices and chunk.choices[0].delta.content:
            streamed_content.append(chunk.choices[0].delta.content)
    assert len(streamed_content) > 0
    # Join the raw deltas before normalising: stripping each delta would remove
    # the whitespace between tokens and break matching of multi-word answers.
    assert expected.lower() in "".join(streamed_content).lower()
a/uv.lock +++ b/uv.lock @@ -1384,6 +1384,7 @@ dependencies = [ { name = "jinja2" }, { name = "jsonschema" }, { name = "llama-stack-client" }, + { name = "openai" }, { name = "pillow" }, { name = "prompt-toolkit" }, { name = "pydantic" }, @@ -1485,6 +1486,7 @@ requires-dist = [ { name = "mcp", marker = "extra == 'test'" }, { name = "myst-parser", marker = "extra == 'docs'" }, { name = "nbval", marker = "extra == 'dev'" }, + { name = "openai", specifier = ">=1.66" }, { name = "openai", marker = "extra == 'test'" }, { name = "openai", marker = "extra == 'unit'" }, { name = "opentelemetry-exporter-otlp-proto-http", marker = "extra == 'test'" }, @@ -2016,7 +2018,7 @@ wheels = [ [[package]] name = "openai" -version = "1.63.2" +version = "1.71.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -2028,9 +2030,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e6/1c/11b520deb71f9ea54ced3c52cd6a5f7131215deba63ad07f23982e328141/openai-1.63.2.tar.gz", hash = "sha256:aeabeec984a7d2957b4928ceaa339e2ead19c61cfcf35ae62b7c363368d26360", size = 356902 } +sdist = { url = "https://files.pythonhosted.org/packages/d9/19/b8f0347090a649dce55a008ec54ac6abb50553a06508cdb5e7abb2813e99/openai-1.71.0.tar.gz", hash = "sha256:52b20bb990a1780f9b0b8ccebac93416343ebd3e4e714e3eff730336833ca207", size = 409926 } wheels = [ - { url = "https://files.pythonhosted.org/packages/15/64/db3462b358072387b8e93e6e6a38d3c741a17b4a84171ef01d6c85c63f25/openai-1.63.2-py3-none-any.whl", hash = "sha256:1f38b27b5a40814c2b7d8759ec78110df58c4a614c25f182809ca52b080ff4d4", size = 472282 }, + { url = "https://files.pythonhosted.org/packages/c4/f7/049e85faf6a000890e5ca0edca8e9183f8a43c9e7bba869cad871da0caba/openai-1.71.0-py3-none-any.whl", hash = "sha256:e1c643738f1fff1af52bce6ef06a7716c95d089281e7011777179614f32937aa", size = 598975 }, ] [[package]]