diff --git a/litellm/responses/litellm_completion_transformation/streaming_iterator.py b/litellm/responses/litellm_completion_transformation/streaming_iterator.py
index d970746f89..6f2d5bc185 100644
--- a/litellm/responses/litellm_completion_transformation/streaming_iterator.py
+++ b/litellm/responses/litellm_completion_transformation/streaming_iterator.py
@@ -7,15 +7,18 @@ from litellm.responses.litellm_completion_transformation.transformation import (
 )
 from litellm.responses.streaming_iterator import ResponsesAPIStreamingIterator
 from litellm.types.llms.openai import (
+    OutputTextDeltaEvent,
     ResponseCompletedEvent,
     ResponseInputParam,
     ResponsesAPIOptionalRequestParams,
     ResponsesAPIStreamEvents,
     ResponsesAPIStreamingResponse,
 )
+from litellm.types.utils import Delta as ChatCompletionDelta
 from litellm.types.utils import (
     ModelResponse,
     ModelResponseStream,
+    StreamingChoices,
     TextCompletionResponse,
 )

@@ -38,7 +41,7 @@ class LiteLLMCompletionStreamingIterator(ResponsesAPIStreamingIterator):
         self.responses_api_request: ResponsesAPIOptionalRequestParams = (
             responses_api_request
         )
-        self.collected_chunks: List[ModelResponseStream] = []
+        self.collected_chat_completion_chunks: List[ModelResponseStream] = []
         self.finished: bool = False

     async def __anext__(
@@ -51,7 +54,14 @@ class LiteLLMCompletionStreamingIterator(ResponsesAPIStreamingIterator):
                 # Get the next chunk from the stream
                 try:
                     chunk = await self.litellm_custom_stream_wrapper.__anext__()
-                    self.collected_chunks.append(chunk)
+                    self.collected_chat_completion_chunks.append(chunk)
+                    response_api_chunk = (
+                        self._transform_chat_completion_chunk_to_response_api_chunk(
+                            chunk
+                        )
+                    )
+                    if response_api_chunk:
+                        return response_api_chunk
                 except StopAsyncIteration:
                     self.finished = True
                     response_completed_event = self._emit_response_completed_event()
@@ -74,28 +84,65 @@ class LiteLLMCompletionStreamingIterator(ResponsesAPIStreamingIterator):
         try:
             while True:
                 if self.finished is True:
-                    raise StopAsyncIteration
+                    raise StopIteration
                 # Get the next chunk from the stream
                 try:
                     chunk = self.litellm_custom_stream_wrapper.__next__()
-                    self.collected_chunks.append(chunk)
-                except StopAsyncIteration:
+                    self.collected_chat_completion_chunks.append(chunk)
+                    response_api_chunk = (
+                        self._transform_chat_completion_chunk_to_response_api_chunk(
+                            chunk
+                        )
+                    )
+                    if response_api_chunk:
+                        return response_api_chunk
+                except StopIteration:
                     self.finished = True
                     response_completed_event = self._emit_response_completed_event()
                     if response_completed_event:
                         return response_completed_event
                     else:
-                        raise StopAsyncIteration
+                        raise StopIteration
         except Exception as e:
             # Handle HTTP errors
             self.finished = True
             raise e

+    def _transform_chat_completion_chunk_to_response_api_chunk(
+        self, chunk: ModelResponseStream
+    ) -> Optional[ResponsesAPIStreamingResponse]:
+        """
+        Transform a chat completion chunk into a Responses API chunk.
+
+        This currently only handles emitting the OutputTextDeltaEvent, which is used by other tools consuming the Responses API.
+        """
+        return OutputTextDeltaEvent(
+            type=ResponsesAPIStreamEvents.OUTPUT_TEXT_DELTA,
+            item_id=chunk.id,
+            output_index=0,
+            content_index=0,
+            delta=self._get_delta_string_from_streaming_choices(chunk.choices),
+        )
+
+    def _get_delta_string_from_streaming_choices(
+        self, choices: List[StreamingChoices]
+    ) -> str:
+        """
+        Get the delta string from the streaming choices.
+
+        For now this collects the first choice's delta string.
+
+        It's unclear how users expect litellm to translate multiple choices per chunk into the Responses API output.
+        """
+        choice = choices[0]
+        chat_completion_delta: ChatCompletionDelta = choice.delta
+        return chat_completion_delta.content or ""
+
     def _emit_response_completed_event(self) -> Optional[ResponseCompletedEvent]:
         litellm_model_response: Optional[
             Union[ModelResponse, TextCompletionResponse]
-        ] = stream_chunk_builder(chunks=self.collected_chunks)
+        ] = stream_chunk_builder(chunks=self.collected_chat_completion_chunks)

         if litellm_model_response and isinstance(litellm_model_response, ModelResponse):
             return ResponseCompletedEvent(
diff --git a/litellm/responses/litellm_completion_transformation/transformation.py b/litellm/responses/litellm_completion_transformation/transformation.py
index b1e52eb8f3..c00d6622bf 100644
--- a/litellm/responses/litellm_completion_transformation/transformation.py
+++ b/litellm/responses/litellm_completion_transformation/transformation.py
@@ -7,6 +7,7 @@ from typing import Any, Dict, List, Optional, Union
 from openai.types.responses.tool_param import FunctionToolParam

 from litellm.caching import InMemoryCache
+from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.responses.litellm_completion_transformation.session_handler import (
     ResponsesAPISessionElement,
     SessionHandler,
@@ -88,6 +89,18 @@ class LiteLLMCompletionResponsesConfig:
             "custom_llm_provider": custom_llm_provider,
         }

+        # Responses API `Completed` events require usage, so we pass `stream_options` to litellm.completion to include usage
+        if stream is True:
+            stream_options = {
+                "include_usage": True,
+            }
+            litellm_completion_request["stream_options"] = stream_options
+            litellm_logging_obj: Optional[LiteLLMLoggingObj] = kwargs.get(
+                "litellm_logging_obj"
+            )
+            if litellm_logging_obj:
+                litellm_logging_obj.stream_options = stream_options
+
         # only pass non-None values
         litellm_completion_request = {
             k: v for k, v in litellm_completion_request.items() if v is not None
diff --git a/litellm/responses/streaming_iterator.py b/litellm/responses/streaming_iterator.py
index 3039efb9f7..e050c47080 100644
--- a/litellm/responses/streaming_iterator.py
+++ b/litellm/responses/streaming_iterator.py
@@ -11,7 +11,9 @@ from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
 from litellm.litellm_core_utils.thread_pool_executor import executor
 from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig
 from litellm.types.llms.openai import (
+    OutputTextDeltaEvent,
     ResponseCompletedEvent,
+    ResponsesAPIResponse,
     ResponsesAPIStreamEvents,
     ResponsesAPIStreamingResponse,
 )
@@ -212,9 +214,14 @@ class SyncResponsesAPIStreamingIterator(BaseResponsesAPIStreamingIterator):

 class MockResponsesAPIStreamingIterator(BaseResponsesAPIStreamingIterator):
     """
-    mock iterator - some models like o1-pro do not support streaming, we need to fake a stream
+    Mock iterator: fake a stream by slicing the full response text into
+    5-char deltas, then emit a completed event.
+
+    Models like o1-pro don't support streaming, so we fake it.
     """

+    CHUNK_SIZE = 5
+
     def __init__(
         self,
         response: httpx.Response,
@@ -222,49 +229,68 @@ class MockResponsesAPIStreamingIterator(BaseResponsesAPIStreamingIterator):
         responses_api_provider_config: BaseResponsesAPIConfig,
         logging_obj: LiteLLMLoggingObj,
     ):
-        self.raw_http_response = response
         super().__init__(
             response=response,
             model=model,
             responses_api_provider_config=responses_api_provider_config,
             logging_obj=logging_obj,
         )
-        self.is_done = False
+
+        # one-time transform
+        transformed = (
+            self.responses_api_provider_config.transform_response_api_response(
+                model=self.model,
+                raw_response=response,
+                logging_obj=logging_obj,
+            )
+        )
+        full_text = self._collect_text(transformed)
+
+        # build a list of 5-char delta events
+        deltas = [
+            OutputTextDeltaEvent(
+                type=ResponsesAPIStreamEvents.OUTPUT_TEXT_DELTA,
+                delta=full_text[i : i + self.CHUNK_SIZE],
+                item_id=transformed.id,
+                output_index=0,
+                content_index=0,
+            )
+            for i in range(0, len(full_text), self.CHUNK_SIZE)
+        ]
+
+        # append the completed event
+        self._events = deltas + [
+            ResponseCompletedEvent(
+                type=ResponsesAPIStreamEvents.RESPONSE_COMPLETED,
+                response=transformed,
+            )
+        ]
+        self._idx = 0

     def __aiter__(self):
         return self

     async def __anext__(self) -> ResponsesAPIStreamingResponse:
-        if self.is_done:
+        if self._idx >= len(self._events):
             raise StopAsyncIteration
-        self.is_done = True
-        transformed_response = (
-            self.responses_api_provider_config.transform_response_api_response(
-                model=self.model,
-                raw_response=self.raw_http_response,
-                logging_obj=self.logging_obj,
-            )
-        )
-        return ResponseCompletedEvent(
-            type=ResponsesAPIStreamEvents.RESPONSE_COMPLETED,
-            response=transformed_response,
-        )
+        evt = self._events[self._idx]
+        self._idx += 1
+        return evt

     def __iter__(self):
         return self

     def __next__(self) -> ResponsesAPIStreamingResponse:
-        if self.is_done:
+        if self._idx >= len(self._events):
             raise StopIteration
-        self.is_done = True
-        transformed_response = (
-            self.responses_api_provider_config.transform_response_api_response(
-                model=self.model,
-                raw_response=self.raw_http_response,
-                logging_obj=self.logging_obj,
-            )
-        )
-        return ResponseCompletedEvent(
-            type=ResponsesAPIStreamEvents.RESPONSE_COMPLETED,
-            response=transformed_response,
-        )
+        evt = self._events[self._idx]
+        self._idx += 1
+        return evt
+
+    def _collect_text(self, resp: ResponsesAPIResponse) -> str:
+        out = ""
+        for out_item in resp.output:
+            if out_item.type == "message":
+                for c in getattr(out_item, "content", []):
+                    out += c.text
+        return out
diff --git a/tests/llm_responses_api_testing/base_responses_api.py b/tests/llm_responses_api_testing/base_responses_api.py
index 884d9bda7b..fd39c13604 100644
--- a/tests/llm_responses_api_testing/base_responses_api.py
+++ b/tests/llm_responses_api_testing/base_responses_api.py
@@ -133,11 +133,13 @@ class BaseResponsesAPITest(ABC):
         validate_responses_api_response(response, final_chunk=True)


-    @pytest.mark.parametrize("sync_mode", [True])
+    @pytest.mark.parametrize("sync_mode", [True, False])
     @pytest.mark.asyncio
     async def test_basic_openai_responses_api_streaming(self, sync_mode):
         litellm._turn_on_debug()
         base_completion_call_args = self.get_base_completion_call_args()
+        collected_content_string = ""
+        response_completed_event = None
         if sync_mode:
             response = litellm.responses(
                 input="Basic ping",
@@ -146,6 +148,10 @@ class BaseResponsesAPITest(ABC):
             )
             for event in response:
                 print("litellm response=", json.dumps(event, indent=4, default=str))
+                if event.type == "response.output_text.delta":
+                    collected_content_string += event.delta
+                elif event.type == "response.completed":
+                    response_completed_event = event
         else:
             response = await litellm.aresponses(
                 input="Basic ping",
@@ -154,5 +160,35 @@ class BaseResponsesAPITest(ABC):
             )
             async for event in response:
                 print("litellm response=", json.dumps(event, indent=4, default=str))
+                if event.type == "response.output_text.delta":
+                    collected_content_string += event.delta
+                elif event.type == "response.completed":
+                    response_completed_event = event
+
+        # assert the delta chunks carried content, i.e. len(collected_content_string) > 0
+        # this content is typically rendered on chat UIs
+        assert len(collected_content_string) > 0
+
+        # assert the response completed event is not None
+        assert response_completed_event is not None
+
+        # assert the response completed event has a response
+        assert response_completed_event.response is not None
+
+        # assert the response completed event includes the usage
+        assert response_completed_event.response.usage is not None
+
+        # basic sanity checks that the usage seems reasonable
+        print("response_completed_event.response.usage=", response_completed_event.response.usage)
+        assert response_completed_event.response.usage.input_tokens > 0 and response_completed_event.response.usage.input_tokens < 100
+        assert response_completed_event.response.usage.output_tokens > 0 and response_completed_event.response.usage.output_tokens < 1000
+        assert response_completed_event.response.usage.total_tokens > 0 and response_completed_event.response.usage.total_tokens < 1000
+
+        # total tokens should be the sum of input and output tokens
+        assert response_completed_event.response.usage.total_tokens == response_completed_event.response.usage.input_tokens + response_completed_event.response.usage.output_tokens
+
+
+
diff --git a/ui/litellm-dashboard/src/components/chat_ui.tsx b/ui/litellm-dashboard/src/components/chat_ui.tsx
index ae8d15cfe1..6f9801c632 100644
--- a/ui/litellm-dashboard/src/components/chat_ui.tsx
+++ b/ui/litellm-dashboard/src/components/chat_ui.tsx
@@ -26,6 +26,7 @@ import {
 import { message, Select, Spin, Typography, Tooltip, Input } from "antd";
 import { makeOpenAIChatCompletionRequest } from "./chat_ui/llm_calls/chat_completion";
 import { makeOpenAIImageGenerationRequest } from "./chat_ui/llm_calls/image_generation";
+import { makeOpenAIResponsesRequest } from "./chat_ui/llm_calls/responses_api";
 import { fetchAvailableModels, ModelGroup } from "./chat_ui/llm_calls/fetch_models";
 import { litellmModeMapping, ModelMode, EndpointType, getEndpointType } from "./chat_ui/mode_endpoint_mapping";
 import { Prism as SyntaxHighlighter } from "react-syntax-highlighter";
@@ -137,20 +138,28 @@ const ChatUI: React.FC = ({
   }, [chatHistory]);

   const updateTextUI = (role: string, chunk: string, model?: string) => {
-    setChatHistory((prevHistory) => {
-      const lastMessage = prevHistory[prevHistory.length - 1];
-
-      if (lastMessage && lastMessage.role === role && !lastMessage.isImage) {
+    console.log("updateTextUI called with:", role, chunk, model);
+    setChatHistory((prev) => {
+      const last = prev[prev.length - 1];
+      // if the last message is already from this same role, append
+      if (last && last.role === role && !last.isImage) {
+        // build a new object, but only set `model` if it wasn't there already
+        const updated: MessageType = {
+          ...last,
+          content: last.content + chunk,
+          model: last.model ?? model, // only use the passed-in model on the first chunk
+        };
+        return [...prev.slice(0, -1), updated];
+      } else {
+        // otherwise start a brand new assistant bubble
         return [
-          ...prevHistory.slice(0, prevHistory.length - 1),
-          {
-            ...lastMessage,
-            content: lastMessage.content + chunk,
-            model
+          ...prev,
+          {
+            role,
+            content: chunk,
+            model, // model set exactly once here
           },
         ];
-      } else {
-        return [...prevHistory, { role, content: chunk, model }];
       }
     });
   };
@@ -297,7 +306,6 @@ const ChatUI: React.FC = ({

     try {
       if (selectedModel) {
-        // Use EndpointType enum for comparison
         if (endpointType === EndpointType.CHAT) {
           // Create chat history for API call - strip out model field and isImage field
           const apiChatHistory = [...chatHistory.filter(msg => !msg.isImage).map(({ role, content }) => ({ role, content })), newUserMessage];
@@ -323,6 +331,21 @@ const ChatUI: React.FC = ({
             selectedTags,
             signal
           );
+        } else if (endpointType === EndpointType.RESPONSES) {
+          // Create chat history for API call - strip out model field and isImage field
+          const apiChatHistory = [...chatHistory.filter(msg => !msg.isImage).map(({ role, content }) => ({ role, content })), newUserMessage];
+
+          await makeOpenAIResponsesRequest(
+            apiChatHistory,
+            (role, delta, model) => updateTextUI(role, delta, model),
+            selectedModel,
+            effectiveApiKey,
+            selectedTags,
+            signal,
+            updateReasoningContent,
+            updateTimingData,
+            updateUsageData
+          );
         }
       }
     } catch (error) {
@@ -592,7 +615,7 @@ const ChatUI: React.FC = ({
                   onChange={(e) => setInputMessage(e.target.value)}
                   onKeyDown={handleKeyDown}
                   placeholder={
-                    endpointType === EndpointType.CHAT
+                    endpointType === EndpointType.CHAT || endpointType === EndpointType.RESPONSES
                       ? "Type your message... (Shift+Enter for new line)"
                       : "Describe the image you want to generate..."
                   }
diff --git a/ui/litellm-dashboard/src/components/chat_ui/EndpointSelector.tsx b/ui/litellm-dashboard/src/components/chat_ui/EndpointSelector.tsx
index 49b1df3e97..12d5acbc70 100644
--- a/ui/litellm-dashboard/src/components/chat_ui/EndpointSelector.tsx
+++ b/ui/litellm-dashboard/src/components/chat_ui/EndpointSelector.tsx
@@ -19,8 +19,9 @@ const EndpointSelector: React.FC = ({
 }) => {
   // Map endpoint types to their display labels
   const endpointOptions = [
-    { value: EndpointType.CHAT, label: '/chat/completions' },
-    { value: EndpointType.IMAGE, label: '/images/generations' }
+    { value: EndpointType.CHAT, label: '/v1/chat/completions' },
+    { value: EndpointType.RESPONSES, label: '/v1/responses' },
+    { value: EndpointType.IMAGE, label: '/v1/images/generations' },
   ];

   return (
diff --git a/ui/litellm-dashboard/src/components/chat_ui/llm_calls/responses_api.tsx b/ui/litellm-dashboard/src/components/chat_ui/llm_calls/responses_api.tsx
new file mode 100644
index 0000000000..744935159b
--- /dev/null
+++ b/ui/litellm-dashboard/src/components/chat_ui/llm_calls/responses_api.tsx
@@ -0,0 +1,131 @@
+import openai from "openai";
+import { message } from "antd";
+import { MessageType } from "../types";
+import { TokenUsage } from "../ResponseMetrics";
+
+export async function makeOpenAIResponsesRequest(
+  messages: MessageType[],
+  updateTextUI: (role: string, delta: string, model?: string) => void,
+  selectedModel: string,
+  accessToken: string | null,
+  tags: string[] = [],
+  signal?: AbortSignal,
+  onReasoningContent?: (content: string) => void,
+  onTimingData?: (timeToFirstToken: number) => void,
+  onUsageData?: (usage: TokenUsage) => void
+) {
+  if (!accessToken) {
+    throw new Error("API key is required");
+  }
+
+  // Base URL should be the current base_url
+  const isLocal = process.env.NODE_ENV === "development";
+  if (isLocal !== true) {
+    console.log = function () {};
+  }
+
+  const proxyBaseUrl = isLocal
+    ? "http://localhost:4000"
+    : window.location.origin;
+
+  const client = new openai.OpenAI({
+    apiKey: accessToken,
+    baseURL: proxyBaseUrl,
+    dangerouslyAllowBrowser: true,
+    defaultHeaders: tags && tags.length > 0 ? { 'x-litellm-tags': tags.join(',') } : undefined,
+  });
+
+  try {
+    const startTime = Date.now();
+    let firstTokenReceived = false;
+
+    // Format messages for the API
+    const formattedInput = messages.map(message => ({
+      role: message.role,
+      content: message.content,
+      type: "message"
+    }));
+
+    // Create request to OpenAI responses API
+    // Use 'any' type to avoid TypeScript issues with the experimental API
+    const response = await (client as any).responses.create({
+      model: selectedModel,
+      input: formattedInput,
+      stream: true,
+    }, { signal });
+
+    for await (const event of response) {
+      console.log("Response event:", event);
+
+      // Use a type-safe approach to handle events
+      if (typeof event === 'object' && event !== null) {
+        // Handle output text deltas
+        // 1) drop any "role" streams
+        if (event.type === "response.role.delta") {
+          continue;
+        }
+
+        // 2) only handle actual text deltas
+        if (event.type === "response.output_text.delta" && typeof event.delta === "string") {
+          const delta = event.delta;
+          console.log("Text delta", delta);
+          // skip pure whitespace/newlines
+          if (delta.trim().length > 0) {
+            updateTextUI("assistant", delta, selectedModel);
+
+            // Calculate time to first token
+            if (!firstTokenReceived) {
+              firstTokenReceived = true;
+              const timeToFirstToken = Date.now() - startTime;
+              console.log("First token received! Time:", timeToFirstToken, "ms");
+
+              if (onTimingData) {
+                onTimingData(timeToFirstToken);
+              }
+            }
+          }
+        }
+
+        // Handle reasoning content
+        if (event.type === "response.reasoning.delta" && 'delta' in event) {
+          const delta = event.delta;
+          if (typeof delta === 'string' && onReasoningContent) {
+            onReasoningContent(delta);
+          }
+        }
+
+        // Handle usage data in the response.completed event
+        if (event.type === "response.completed" && 'response' in event) {
+          const response_obj = event.response;
+          const usage = response_obj.usage;
+          console.log("Usage data:", usage);
+          if (usage && onUsageData) {
+            // Extract usage data safely
+            const usageData: TokenUsage = {
+              completionTokens: usage.output_tokens,
+              promptTokens: usage.input_tokens,
+              totalTokens: usage.total_tokens
+            };
+
+            // Add reasoning tokens if available
+            if (usage.completion_tokens_details?.reasoning_tokens) {
+              usageData.reasoningTokens = usage.completion_tokens_details.reasoning_tokens;
+            }
+
+            onUsageData(usageData);
+          }
+        }
+      }
+    }
+  } catch (error) {
+    if (signal?.aborted) {
+      console.log("Responses API request was cancelled");
+    } else {
+      message.error(`Error occurred while generating model response. Please try again. Error: ${error}`, 20);
+    }
+    throw error; // Re-throw to allow the caller to handle the error
+  }
+}
\ No newline at end of file
diff --git a/ui/litellm-dashboard/src/components/chat_ui/mode_endpoint_mapping.tsx b/ui/litellm-dashboard/src/components/chat_ui/mode_endpoint_mapping.tsx
index 0ed0098fac..ea86831842 100644
--- a/ui/litellm-dashboard/src/components/chat_ui/mode_endpoint_mapping.tsx
+++ b/ui/litellm-dashboard/src/components/chat_ui/mode_endpoint_mapping.tsx
@@ -4,6 +4,7 @@
 export enum ModelMode {
   IMAGE_GENERATION = "image_generation",
   CHAT = "chat",
+  RESPONSES = "responses",
   // add additional modes as needed
 }

@@ -11,6 +12,7 @@ export enum ModelMode {
 export enum EndpointType {
   IMAGE = "image",
   CHAT = "chat",
+  RESPONSES = "responses",
   // add additional endpoint types if required
 }

@@ -18,6 +20,7 @@ export enum ModelMode {
 export const litellmModeMapping: Record<ModelMode, EndpointType> = {
   [ModelMode.IMAGE_GENERATION]: EndpointType.IMAGE,
   [ModelMode.CHAT]: EndpointType.CHAT,
+  [ModelMode.RESPONSES]: EndpointType.RESPONSES,
 };

 export const getEndpointType = (mode: string): EndpointType => {
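Example (illustrative, not part of the patch): a minimal sketch of how the streamed events introduced above can be consumed through the litellm SDK, mirroring test_basic_openai_responses_api_streaming. The model name is an assumption; any Responses-API-capable model configured for litellm behaves the same way.

import litellm

# model name is a placeholder, purely for illustration
response = litellm.responses(
    model="openai/gpt-4o-mini",
    input="Basic ping",
    stream=True,
)

collected_text = ""
completed_event = None

for event in response:
    # OutputTextDeltaEvent chunks, one per chat-completion chunk, emitted by
    # LiteLLMCompletionStreamingIterator._transform_chat_completion_chunk_to_response_api_chunk
    if event.type == "response.output_text.delta":
        collected_text += event.delta
    # ResponseCompletedEvent built from the collected chunks via stream_chunk_builder
    elif event.type == "response.completed":
        completed_event = event

print(collected_text)
if completed_event is not None:
    usage = completed_event.response.usage
    # usage is populated because stream=True now also sets stream_options={"include_usage": True}
    print(usage.input_tokens, usage.output_tokens, usage.total_tokens)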
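For the dashboard path, responses_api.tsx above does roughly the equivalent of the following Python sketch against the LiteLLM proxy. This is an illustration under assumptions: an openai-python release that exposes client.responses, a proxy on http://localhost:4000 (the dashboard's dev default), and placeholder key and model values. It fans the same two event types out the way the UI callbacks do.

from openai import OpenAI

# key and model are placeholders; the base_url matches the dashboard's dev default
client = OpenAI(api_key="sk-litellm-proxy-key", base_url="http://localhost:4000")

stream = client.responses.create(
    model="gpt-4o-mini",
    input=[{"role": "user", "content": "Basic ping", "type": "message"}],
    stream=True,
)

text = ""
usage = None
for event in stream:
    if event.type == "response.output_text.delta":
        text += event.delta           # what updateTextUI appends to the chat bubble
    elif event.type == "response.completed":
        usage = event.response.usage  # what the UI maps into TokenUsage

print(text)
print(usage)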