[Feat] Expose Responses API on LiteLLM UI Test Key Page (#10166)

* add /responses API on UI

* add makeOpenAIResponsesRequest

* add makeOpenAIResponsesRequest

* fix add responses API on UI

* fix endpoint selector

* responses API render chunks on litellm chat ui

* fixes to streaming iterator

* fix render responses completed events

* fixes for MockResponsesAPIStreamingIterator

* transform_responses_api_request_to_chat_completion_request

* fix for responses API

* test_basic_openai_responses_api_streaming

* fix base responses api tests
Ishaan Jaff 2025-04-19 13:18:54 -07:00 committed by GitHub
parent 03b5399f86
commit 0717369ae6
8 changed files with 332 additions and 52 deletions
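
For reference, a minimal sketch of what the Test Key page now does under the hood: stream from the proxy's /v1/responses endpoint with the OpenAI SDK. The base URL, key, and model name below are placeholders, not values from this PR.

from openai import OpenAI

# Assumed setup: a LiteLLM proxy on localhost:4000 and a virtual key.
client = OpenAI(api_key="sk-1234", base_url="http://localhost:4000")

# stream=True yields Responses API events, mirroring the chat UI behaviour.
stream = client.responses.create(
    model="gpt-4o",
    input="Basic ping",
    stream=True,
)

collected = ""
for event in stream:
    if event.type == "response.output_text.delta":
        collected += event.delta  # incremental text, rendered live in the UI
    elif event.type == "response.completed":
        print(collected, event.response.usage)  # usage arrives on the final event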


@ -7,15 +7,18 @@ from litellm.responses.litellm_completion_transformation.transformation import (
)
from litellm.responses.streaming_iterator import ResponsesAPIStreamingIterator
from litellm.types.llms.openai import (
OutputTextDeltaEvent,
ResponseCompletedEvent,
ResponseInputParam,
ResponsesAPIOptionalRequestParams,
ResponsesAPIStreamEvents,
ResponsesAPIStreamingResponse,
)
from litellm.types.utils import Delta as ChatCompletionDelta
from litellm.types.utils import (
ModelResponse,
ModelResponseStream,
StreamingChoices,
TextCompletionResponse,
)
@ -38,7 +41,7 @@ class LiteLLMCompletionStreamingIterator(ResponsesAPIStreamingIterator):
self.responses_api_request: ResponsesAPIOptionalRequestParams = (
responses_api_request
)
self.collected_chunks: List[ModelResponseStream] = []
self.collected_chat_completion_chunks: List[ModelResponseStream] = []
self.finished: bool = False
async def __anext__(
@ -51,7 +54,14 @@ class LiteLLMCompletionStreamingIterator(ResponsesAPIStreamingIterator):
# Get the next chunk from the stream
try:
chunk = await self.litellm_custom_stream_wrapper.__anext__()
self.collected_chunks.append(chunk)
self.collected_chat_completion_chunks.append(chunk)
response_api_chunk = (
self._transform_chat_completion_chunk_to_response_api_chunk(
chunk
)
)
if response_api_chunk:
return response_api_chunk
except StopAsyncIteration:
self.finished = True
response_completed_event = self._emit_response_completed_event()
@ -74,28 +84,65 @@ class LiteLLMCompletionStreamingIterator(ResponsesAPIStreamingIterator):
try:
while True:
if self.finished is True:
raise StopAsyncIteration
raise StopIteration
# Get the next chunk from the stream
try:
chunk = self.litellm_custom_stream_wrapper.__next__()
self.collected_chunks.append(chunk)
except StopAsyncIteration:
self.collected_chat_completion_chunks.append(chunk)
response_api_chunk = (
self._transform_chat_completion_chunk_to_response_api_chunk(
chunk
)
)
if response_api_chunk:
return response_api_chunk
except StopIteration:
self.finished = True
response_completed_event = self._emit_response_completed_event()
if response_completed_event:
return response_completed_event
else:
raise StopAsyncIteration
raise StopIteration
except Exception as e:
# Handle HTTP errors
self.finished = True
raise e
def _transform_chat_completion_chunk_to_response_api_chunk(
self, chunk: ModelResponseStream
) -> Optional[ResponsesAPIStreamingResponse]:
"""
Transform a chat completion chunk to a response API chunk.
This currently only handles emitting the OutputTextDeltaEvent, which is what other tools consuming the Responses API rely on.
"""
return OutputTextDeltaEvent(
type=ResponsesAPIStreamEvents.OUTPUT_TEXT_DELTA,
item_id=chunk.id,
output_index=0,
content_index=0,
delta=self._get_delta_string_from_streaming_choices(chunk.choices),
)
def _get_delta_string_from_streaming_choices(
self, choices: List[StreamingChoices]
) -> str:
"""
Get the delta string from the streaming choices.
For now this returns the first choice's delta string.
It's unclear how users expect litellm to translate multiple choices per chunk into Responses API output.
"""
choice = choices[0]
chat_completion_delta: ChatCompletionDelta = choice.delta
return chat_completion_delta.content or ""
def _emit_response_completed_event(self) -> Optional[ResponseCompletedEvent]:
litellm_model_response: Optional[
Union[ModelResponse, TextCompletionResponse]
] = stream_chunk_builder(chunks=self.collected_chunks)
] = stream_chunk_builder(chunks=self.collected_chat_completion_chunks)
if litellm_model_response and isinstance(litellm_model_response, ModelResponse):
return ResponseCompletedEvent(
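
In plain terms, each chat-completion stream chunk maps onto a single output-text delta event. A rough sketch of that mapping, using plain dicts rather than the real litellm/OpenAI types (field names follow the code above):

def chunk_to_output_text_delta(chunk: dict) -> dict:
    # first choice only; multi-choice handling is intentionally left undefined
    delta_text = (chunk["choices"][0].get("delta") or {}).get("content") or ""
    return {
        "type": "response.output_text.delta",
        "item_id": chunk["id"],
        "output_index": 0,
        "content_index": 0,
        "delta": delta_text,
    }

print(chunk_to_output_text_delta(
    {"id": "chatcmpl-123", "choices": [{"delta": {"content": "Hel"}}]}
)["delta"])  # -> "Hel"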


@ -7,6 +7,7 @@ from typing import Any, Dict, List, Optional, Union
from openai.types.responses.tool_param import FunctionToolParam
from litellm.caching import InMemoryCache
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.responses.litellm_completion_transformation.session_handler import (
ResponsesAPISessionElement,
SessionHandler,
@ -88,6 +89,18 @@ class LiteLLMCompletionResponsesConfig:
"custom_llm_provider": custom_llm_provider,
}
# Responses API `Completed` events require usage, so we pass `stream_options` to litellm.completion to include usage
if stream is True:
stream_options = {
"include_usage": True,
}
litellm_completion_request["stream_options"] = stream_options
litellm_logging_obj: Optional[LiteLLMLoggingObj] = kwargs.get(
"litellm_logging_obj"
)
if litellm_logging_obj:
litellm_logging_obj.stream_options = stream_options
# only pass non-None values
litellm_completion_request = {
k: v for k, v in litellm_completion_request.items() if v is not None
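
The reason for forwarding stream_options: with include_usage, the provider emits a final chunk that carries token usage, which the response.completed event needs. A sketch of the same flag on a direct litellm.completion call (model name and key setup are assumptions):

import litellm

stream = litellm.completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Basic ping"}],
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in stream:
    usage = getattr(chunk, "usage", None)
    if usage:  # typically only set on the last chunk
        print(usage)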


@ -11,7 +11,9 @@ from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
from litellm.litellm_core_utils.thread_pool_executor import executor
from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig
from litellm.types.llms.openai import (
OutputTextDeltaEvent,
ResponseCompletedEvent,
ResponsesAPIResponse,
ResponsesAPIStreamEvents,
ResponsesAPIStreamingResponse,
)
@ -212,9 +214,14 @@ class SyncResponsesAPIStreamingIterator(BaseResponsesAPIStreamingIterator):
class MockResponsesAPIStreamingIterator(BaseResponsesAPIStreamingIterator):
"""
mock iterator - some models like o1-pro do not support streaming, we need to fake a stream
Mock iterator: fake a stream by slicing the full response text into
5-char deltas, then emit a completed event.
Models like o1-pro don't support streaming, so we fake it.
"""
CHUNK_SIZE = 5
def __init__(
self,
response: httpx.Response,
@ -222,49 +229,68 @@ class MockResponsesAPIStreamingIterator(BaseResponsesAPIStreamingIterator):
responses_api_provider_config: BaseResponsesAPIConfig,
logging_obj: LiteLLMLoggingObj,
):
self.raw_http_response = response
super().__init__(
response=response,
model=model,
responses_api_provider_config=responses_api_provider_config,
logging_obj=logging_obj,
)
self.is_done = False
# one-time transform
transformed = (
self.responses_api_provider_config.transform_response_api_response(
model=self.model,
raw_response=response,
logging_obj=logging_obj,
)
)
full_text = self._collect_text(transformed)
# build a list of 5-char delta events
deltas = [
OutputTextDeltaEvent(
type=ResponsesAPIStreamEvents.OUTPUT_TEXT_DELTA,
delta=full_text[i : i + self.CHUNK_SIZE],
item_id=transformed.id,
output_index=0,
content_index=0,
)
for i in range(0, len(full_text), self.CHUNK_SIZE)
]
# append the completed event
self._events = deltas + [
ResponseCompletedEvent(
type=ResponsesAPIStreamEvents.RESPONSE_COMPLETED,
response=transformed,
)
]
self._idx = 0
def __aiter__(self):
return self
async def __anext__(self) -> ResponsesAPIStreamingResponse:
if self.is_done:
if self._idx >= len(self._events):
raise StopAsyncIteration
self.is_done = True
transformed_response = (
self.responses_api_provider_config.transform_response_api_response(
model=self.model,
raw_response=self.raw_http_response,
logging_obj=self.logging_obj,
)
)
return ResponseCompletedEvent(
type=ResponsesAPIStreamEvents.RESPONSE_COMPLETED,
response=transformed_response,
)
evt = self._events[self._idx]
self._idx += 1
return evt
def __iter__(self):
return self
def __next__(self) -> ResponsesAPIStreamingResponse:
if self.is_done:
if self._idx >= len(self._events):
raise StopIteration
self.is_done = True
transformed_response = (
self.responses_api_provider_config.transform_response_api_response(
model=self.model,
raw_response=self.raw_http_response,
logging_obj=self.logging_obj,
)
)
return ResponseCompletedEvent(
type=ResponsesAPIStreamEvents.RESPONSE_COMPLETED,
response=transformed_response,
)
evt = self._events[self._idx]
self._idx += 1
return evt
def _collect_text(self, resp: ResponsesAPIResponse) -> str:
out = ""
for out_item in resp.output:
if out_item.type == "message":
for c in getattr(out_item, "content", []):
out += c.text
return out
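
The slicing itself is just a fixed stride over the collected text; a quick worked example of the same pattern:

CHUNK_SIZE = 5
full_text = "Hello, world!"
deltas = [full_text[i : i + CHUNK_SIZE] for i in range(0, len(full_text), CHUNK_SIZE)]
print(deltas)  # ['Hello', ', wor', 'ld!']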


@ -133,11 +133,13 @@ class BaseResponsesAPITest(ABC):
validate_responses_api_response(response, final_chunk=True)
@pytest.mark.parametrize("sync_mode", [True])
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_basic_openai_responses_api_streaming(self, sync_mode):
litellm._turn_on_debug()
base_completion_call_args = self.get_base_completion_call_args()
collected_content_string = ""
response_completed_event = None
if sync_mode:
response = litellm.responses(
input="Basic ping",
@ -146,6 +148,10 @@ class BaseResponsesAPITest(ABC):
)
for event in response:
print("litellm response=", json.dumps(event, indent=4, default=str))
if event.type == "response.output_text.delta":
collected_content_string += event.delta
elif event.type == "response.completed":
response_completed_event = event
else:
response = await litellm.aresponses(
input="Basic ping",
@ -154,5 +160,35 @@ class BaseResponsesAPITest(ABC):
)
async for event in response:
print("litellm response=", json.dumps(event, indent=4, default=str))
if event.type == "response.output_text.delta":
collected_content_string += event.delta
elif event.type == "response.completed":
response_completed_event = event
# assert the delta chunks' content is non-empty (len(collected_content_string) > 0)
# this content is typically rendered on chat UIs
assert len(collected_content_string) > 0
# assert the response completed event is not None
assert response_completed_event is not None
# assert the response completed event has a response
assert response_completed_event.response is not None
# assert the response completed event includes the usage
assert response_completed_event.response.usage is not None
# basic test assert the usage seems reasonable
print("response_completed_event.response.usage=", response_completed_event.response.usage)
assert response_completed_event.response.usage.input_tokens > 0 and response_completed_event.response.usage.input_tokens < 100
assert response_completed_event.response.usage.output_tokens > 0 and response_completed_event.response.usage.output_tokens < 1000
assert response_completed_event.response.usage.total_tokens > 0 and response_completed_event.response.usage.total_tokens < 1000
# total tokens should be the sum of input and output tokens
assert response_completed_event.response.usage.total_tokens == response_completed_event.response.usage.input_tokens + response_completed_event.response.usage.output_tokens


@ -26,6 +26,7 @@ import {
import { message, Select, Spin, Typography, Tooltip, Input } from "antd";
import { makeOpenAIChatCompletionRequest } from "./chat_ui/llm_calls/chat_completion";
import { makeOpenAIImageGenerationRequest } from "./chat_ui/llm_calls/image_generation";
import { makeOpenAIResponsesRequest } from "./chat_ui/llm_calls/responses_api";
import { fetchAvailableModels, ModelGroup } from "./chat_ui/llm_calls/fetch_models";
import { litellmModeMapping, ModelMode, EndpointType, getEndpointType } from "./chat_ui/mode_endpoint_mapping";
import { Prism as SyntaxHighlighter } from "react-syntax-highlighter";
@ -137,20 +138,28 @@ const ChatUI: React.FC<ChatUIProps> = ({
}, [chatHistory]);
const updateTextUI = (role: string, chunk: string, model?: string) => {
setChatHistory((prevHistory) => {
const lastMessage = prevHistory[prevHistory.length - 1];
if (lastMessage && lastMessage.role === role && !lastMessage.isImage) {
console.log("updateTextUI called with:", role, chunk, model);
setChatHistory((prev) => {
const last = prev[prev.length - 1];
// if the last message is already from this same role, append
if (last && last.role === role && !last.isImage) {
// build a new object, but only set `model` if it wasn't there already
const updated: MessageType = {
...last,
content: last.content + chunk,
model: last.model ?? model, // ← only use the passed-in model on the first chunk
};
return [...prev.slice(0, -1), updated];
} else {
// otherwise start a brand new assistant bubble
return [
...prevHistory.slice(0, prevHistory.length - 1),
{
...lastMessage,
content: lastMessage.content + chunk,
model
...prev,
{
role,
content: chunk,
model, // model set exactly once here
},
];
} else {
return [...prevHistory, { role, content: chunk, model }];
}
});
};
@ -297,7 +306,6 @@ const ChatUI: React.FC<ChatUIProps> = ({
try {
if (selectedModel) {
// Use EndpointType enum for comparison
if (endpointType === EndpointType.CHAT) {
// Create chat history for API call - strip out model field and isImage field
const apiChatHistory = [...chatHistory.filter(msg => !msg.isImage).map(({ role, content }) => ({ role, content })), newUserMessage];
@ -323,6 +331,21 @@ const ChatUI: React.FC<ChatUIProps> = ({
selectedTags,
signal
);
} else if (endpointType === EndpointType.RESPONSES) {
// Create chat history for API call - strip out model field and isImage field
const apiChatHistory = [...chatHistory.filter(msg => !msg.isImage).map(({ role, content }) => ({ role, content })), newUserMessage];
await makeOpenAIResponsesRequest(
apiChatHistory,
(role, delta, model) => updateTextUI(role, delta, model),
selectedModel,
effectiveApiKey,
selectedTags,
signal,
updateReasoningContent,
updateTimingData,
updateUsageData
);
}
}
} catch (error) {
@ -592,7 +615,7 @@ const ChatUI: React.FC<ChatUIProps> = ({
onChange={(e) => setInputMessage(e.target.value)}
onKeyDown={handleKeyDown}
placeholder={
endpointType === EndpointType.CHAT
endpointType === EndpointType.CHAT || endpointType === EndpointType.RESPONSES
? "Type your message... (Shift+Enter for new line)"
: "Describe the image you want to generate..."
}


@ -19,8 +19,9 @@ const EndpointSelector: React.FC<EndpointSelectorProps> = ({
}) => {
// Map endpoint types to their display labels
const endpointOptions = [
{ value: EndpointType.CHAT, label: '/chat/completions' },
{ value: EndpointType.IMAGE, label: '/images/generations' }
{ value: EndpointType.CHAT, label: '/v1/chat/completions' },
{ value: EndpointType.RESPONSES, label: '/v1/responses' },
{ value: EndpointType.IMAGE, label: '/v1/images/generations' },
];
return (


@ -0,0 +1,131 @@
import openai from "openai";
import { message } from "antd";
import { MessageType } from "../types";
import { TokenUsage } from "../ResponseMetrics";
export async function makeOpenAIResponsesRequest(
messages: MessageType[],
updateTextUI: (role: string, delta: string, model?: string) => void,
selectedModel: string,
accessToken: string | null,
tags: string[] = [],
signal?: AbortSignal,
onReasoningContent?: (content: string) => void,
onTimingData?: (timeToFirstToken: number) => void,
onUsageData?: (usage: TokenUsage) => void
) {
if (!accessToken) {
throw new Error("API key is required");
}
// Base URL should be the current base_url
const isLocal = process.env.NODE_ENV === "development";
if (isLocal !== true) {
console.log = function () {};
}
const proxyBaseUrl = isLocal
? "http://localhost:4000"
: window.location.origin;
const client = new openai.OpenAI({
apiKey: accessToken,
baseURL: proxyBaseUrl,
dangerouslyAllowBrowser: true,
defaultHeaders: tags && tags.length > 0 ? { 'x-litellm-tags': tags.join(',') } : undefined,
});
try {
const startTime = Date.now();
let firstTokenReceived = false;
// Format messages for the API
const formattedInput = messages.map(message => ({
role: message.role,
content: message.content,
type: "message"
}));
// Create request to OpenAI responses API
// Use 'any' type to avoid TypeScript issues with the experimental API
const response = await (client as any).responses.create({
model: selectedModel,
input: formattedInput,
stream: true,
}, { signal });
for await (const event of response) {
console.log("Response event:", event);
// Use a type-safe approach to handle events
if (typeof event === 'object' && event !== null) {
// Handle output text delta
// 1) drop any “role” streams
if (event.type === "response.role.delta") {
continue;
}
// 2) only handle actual text deltas
if (event.type === "response.output_text.delta" && typeof event.delta === "string") {
const delta = event.delta;
console.log("Text delta", delta);
// skip pure whitespace/newlines
if (delta.trim().length > 0) {
updateTextUI("assistant", delta, selectedModel);
// Calculate time to first token
if (!firstTokenReceived) {
firstTokenReceived = true;
const timeToFirstToken = Date.now() - startTime;
console.log("First token received! Time:", timeToFirstToken, "ms");
if (onTimingData) {
onTimingData(timeToFirstToken);
}
}
}
}
// Handle reasoning content
if (event.type === "response.reasoning.delta" && 'delta' in event) {
const delta = event.delta;
if (typeof delta === 'string' && onReasoningContent) {
onReasoningContent(delta);
}
}
// Handle usage data at the response.completed event
if (event.type === "response.completed" && 'response' in event) {
const response_obj = event.response;
const usage = response_obj.usage;
console.log("Usage data:", usage);
if (usage && onUsageData) {
console.log("Usage data:", usage);
// Extract usage data safely
const usageData: TokenUsage = {
completionTokens: usage.output_tokens,
promptTokens: usage.input_tokens,
totalTokens: usage.total_tokens
};
// Add reasoning tokens if available
if (usage.completion_tokens_details?.reasoning_tokens) {
usageData.reasoningTokens = usage.completion_tokens_details.reasoning_tokens;
}
onUsageData(usageData);
}
}
}
}
} catch (error) {
if (signal?.aborted) {
console.log("Responses API request was cancelled");
} else {
message.error(`Error occurred while generating model response. Please try again. Error: ${error}`, 20);
}
throw error; // Re-throw to allow the caller to handle the error
}
}


@ -4,6 +4,7 @@
export enum ModelMode {
IMAGE_GENERATION = "image_generation",
CHAT = "chat",
RESPONSES = "responses",
// add additional modes as needed
}
@ -11,6 +12,7 @@ export enum ModelMode {
export enum EndpointType {
IMAGE = "image",
CHAT = "chat",
RESPONSES = "responses",
// add additional endpoint types if required
}
@ -18,6 +20,7 @@ export enum ModelMode {
export const litellmModeMapping: Record<ModelMode, EndpointType> = {
[ModelMode.IMAGE_GENERATION]: EndpointType.IMAGE,
[ModelMode.CHAT]: EndpointType.CHAT,
[ModelMode.RESPONSES]: EndpointType.RESPONSES,
};
export const getEndpointType = (mode: string): EndpointType => {