diff --git a/ui/litellm-dashboard/src/components/chat_ui.tsx b/ui/litellm-dashboard/src/components/chat_ui.tsx
index f7b5233485..3a6ea6b294 100644
--- a/ui/litellm-dashboard/src/components/chat_ui.tsx
+++ b/ui/litellm-dashboard/src/components/chat_ui.tsx
@@ -35,6 +35,7 @@ import TagSelector from "./tag_management/TagSelector";
 import { determineEndpointType } from "./chat_ui/EndpointUtils";
 import { MessageType } from "./chat_ui/types";
 import ReasoningContent from "./chat_ui/ReasoningContent";
+import ResponseMetrics, { TokenUsage } from "./chat_ui/ResponseMetrics";
 import {
   SendOutlined,
   ApiOutlined,
@@ -185,6 +186,47 @@ const ChatUI: React.FC = ({
     });
   };
 
+  const updateTimingData = (timeToFirstToken: number) => {
+    setChatHistory((prevHistory) => {
+      const lastMessage = prevHistory[prevHistory.length - 1];
+
+      if (lastMessage && lastMessage.role === "assistant") {
+        return [
+          ...prevHistory.slice(0, prevHistory.length - 1),
+          {
+            ...lastMessage,
+            timeToFirstToken
+          },
+        ];
+      }
+
+      return prevHistory;
+    });
+  };
+
+  const updateUsageData = (usage: TokenUsage) => {
+    console.log("Received usage data:", usage);
+    setChatHistory((prevHistory) => {
+      const lastMessage = prevHistory[prevHistory.length - 1];
+
+      if (lastMessage && lastMessage.role === "assistant") {
+        console.log("Updating message with usage data:", usage);
+        const updatedMessage = {
+          ...lastMessage,
+          usage
+        };
+        console.log("Updated message:", updatedMessage);
+
+        return [
+          ...prevHistory.slice(0, prevHistory.length - 1),
+          updatedMessage
+        ];
+      }
+
+      return prevHistory;
+    });
+  };
+
   const updateImageUI = (imageUrl: string, model: string) => {
     setChatHistory((prevHistory) => [
       ...prevHistory,
@@ -248,7 +290,9 @@ const ChatUI: React.FC = ({
           effectiveApiKey,
           selectedTags,
           signal,
-          updateReasoningContent
+          updateReasoningContent,
+          updateTimingData,
+          updateUsageData
         );
       } else if (endpointType === EndpointType.IMAGE) {
         // For image generation
@@ -503,6 +547,13 @@ const ChatUI: React.FC = ({
                     {message.content}
                   )}
+
+                  {message.role === "assistant" && (message.timeToFirstToken || message.usage) && (
+                    <ResponseMetrics
+                      timeToFirstToken={message.timeToFirstToken}
+                      usage={message.usage}
+                    />
+                  )}
diff --git a/ui/litellm-dashboard/src/components/chat_ui/ResponseMetrics.tsx b/ui/litellm-dashboard/src/components/chat_ui/ResponseMetrics.tsx
new file mode 100644
index 0000000000..ca201f1988
--- /dev/null
+++ b/ui/litellm-dashboard/src/components/chat_ui/ResponseMetrics.tsx
@@ -0,0 +1,80 @@
+import React from "react";
+import { Tooltip } from "antd";
+import {
+  ClockCircleOutlined,
+  NumberOutlined,
+  ImportOutlined,
+  ExportOutlined,
+  ThunderboltOutlined,
+  BulbOutlined
+} from "@ant-design/icons";
+
+export interface TokenUsage {
+  completionTokens?: number;
+  promptTokens?: number;
+  totalTokens?: number;
+  reasoningTokens?: number;
+}
+
+interface ResponseMetricsProps {
+  timeToFirstToken?: number; // in milliseconds
+  usage?: TokenUsage;
+}
+
+const ResponseMetrics: React.FC<ResponseMetricsProps> = ({
+  timeToFirstToken,
+  usage
+}) => {
+  if (!timeToFirstToken && !usage) return null;
+
+  return (
+    <div className="response-metrics mt-2 flex flex-wrap gap-3 text-xs text-gray-500">
+      {timeToFirstToken !== undefined && (
+        <Tooltip title="Time to first token">
+          <span className="flex items-center gap-1">
+            <ClockCircleOutlined />
+            {(timeToFirstToken / 1000).toFixed(2)}s
+          </span>
+        </Tooltip>
+      )}
+
+      {usage?.promptTokens !== undefined && (
+        <Tooltip title="Prompt tokens">
+          <span className="flex items-center gap-1">
+            <ImportOutlined />
+            In: {usage.promptTokens}
+          </span>
+        </Tooltip>
+      )}
+
+      {usage?.completionTokens !== undefined && (
+        <Tooltip title="Completion tokens">
+          <span className="flex items-center gap-1">
+            <ExportOutlined />
+            Out: {usage.completionTokens}
+          </span>
+        </Tooltip>
+      )}
+
+      {usage?.reasoningTokens !== undefined && (
+        <Tooltip title="Reasoning tokens">
+          <span className="flex items-center gap-1">
+            <BulbOutlined />
+            Reasoning: {usage.reasoningTokens}
+          </span>
+        </Tooltip>
+      )}
+
+      {usage?.totalTokens !== undefined && (
+        <Tooltip title="Total tokens">
+          <span className="flex items-center gap-1">
+            <NumberOutlined />
+            Total: {usage.totalTokens}
+          </span>
+        </Tooltip>
+      )}
+    </div>
+  );
+};
+
+export default ResponseMetrics;
\ No newline at end of file
diff --git a/ui/litellm-dashboard/src/components/chat_ui/llm_calls/chat_completion.tsx b/ui/litellm-dashboard/src/components/chat_ui/llm_calls/chat_completion.tsx
index 7d1bdab66f..8aaac4b2b6 100644
--- a/ui/litellm-dashboard/src/components/chat_ui/llm_calls/chat_completion.tsx
+++ b/ui/litellm-dashboard/src/components/chat_ui/llm_calls/chat_completion.tsx
@@ -1,7 +1,7 @@
 import openai from "openai";
 import { ChatCompletionMessageParam } from "openai/resources/chat/completions";
 import { message } from "antd";
-import { processStreamingResponse } from "./process_stream";
+import { TokenUsage } from "../ResponseMetrics";
 
 export async function makeOpenAIChatCompletionRequest(
   chatHistory: { role: string; content: string }[],
@@ -10,7 +10,9 @@ export async function makeOpenAIChatCompletionRequest(
   accessToken: string,
   tags?: string[],
   signal?: AbortSignal,
-  onReasoningContent?: (content: string) => void
+  onReasoningContent?: (content: string) => void,
+  onTimingData?: (timeToFirstToken: number) => void,
+  onUsageData?: (usage: TokenUsage) => void
 ) {
   // base url should be the current base_url
   const isLocal = process.env.NODE_ENV === "development";
@@ -22,13 +24,21 @@ export async function makeOpenAIChatCompletionRequest(
     ? "http://localhost:4000"
     : window.location.origin;
   const client = new openai.OpenAI({
-    apiKey: accessToken, // Replace with your OpenAI API key
-    baseURL: proxyBaseUrl, // Replace with your OpenAI API base URL
-    dangerouslyAllowBrowser: true, // using a temporary litellm proxy key
+    apiKey: accessToken,
+    baseURL: proxyBaseUrl,
+    dangerouslyAllowBrowser: true,
     defaultHeaders: tags && tags.length > 0 ? { 'x-litellm-tags': tags.join(',') } : undefined,
   });
 
   try {
+    const startTime = Date.now();
+    let firstTokenReceived = false;
+    let timeToFirstToken: number | undefined = undefined;
+
+    // For collecting complete response text
+    let fullResponseContent = "";
+    let fullReasoningContent = "";
+
     const response = await client.chat.completions.create({
       model: selectedModel,
       stream: true,
@@ -36,12 +46,72 @@ export async function makeOpenAIChatCompletionRequest(
     }, { signal });
 
     for await (const chunk of response) {
-      console.log(chunk);
-      // Process the chunk using our utility
-      processStreamingResponse(chunk, {
-        onContent: updateUI,
-        onReasoningContent: onReasoningContent || (() => {})
-      });
+      console.log("Stream chunk:", chunk);
+
+      // Measure time to first token
+      if (!firstTokenReceived && chunk.choices[0]?.delta?.content) {
+        firstTokenReceived = true;
+        timeToFirstToken = Date.now() - startTime;
+        if (onTimingData) {
+          onTimingData(timeToFirstToken);
+        }
+      }
+
+      // Process content
+      if (chunk.choices[0]?.delta?.content) {
+        const content = chunk.choices[0].delta.content;
+        updateUI(content, chunk.model);
+        fullResponseContent += content;
+      }
+
+      // Process reasoning content if present - using type assertion
+      const delta = chunk.choices[0]?.delta as any;
+      if (delta && delta.reasoning_content) {
+        const reasoningContent = delta.reasoning_content;
+        if (onReasoningContent) {
+          onReasoningContent(reasoningContent);
+        }
+        fullReasoningContent += reasoningContent;
+      }
+
+      // Check for usage data using type assertion
+      const chunkWithUsage = chunk as any;
+      if (chunkWithUsage.usage && onUsageData) {
+        console.log("Usage data found:", chunkWithUsage.usage);
+        const usageData: TokenUsage = {
+          completionTokens: chunkWithUsage.usage.completion_tokens,
+          promptTokens: chunkWithUsage.usage.prompt_tokens,
+          totalTokens: chunkWithUsage.usage.total_tokens,
+        };
+
+        // Check for reasoning tokens
+        if (chunkWithUsage.usage.completion_tokens_details?.reasoning_tokens) {
+          usageData.reasoningTokens = chunkWithUsage.usage.completion_tokens_details.reasoning_tokens;
+        }
+
+        onUsageData(usageData);
+      }
     }
+
+    // Always create an estimated usage
+    if (onUsageData) {
+      try {
+        console.log("Creating estimated usage data");
+        // Create a simple usage estimate - approximately 4 characters per token
+        const estimatedUsage: TokenUsage = {
+          promptTokens: Math.ceil(JSON.stringify(chatHistory).length / 4),
+          completionTokens: Math.ceil((fullResponseContent.length) / 4),
+          totalTokens: Math.ceil((JSON.stringify(chatHistory).length + fullResponseContent.length) / 4)
+        };
+
+        if (fullReasoningContent) {
+          estimatedUsage.reasoningTokens = Math.ceil(fullReasoningContent.length / 4);
+        }
+
+        onUsageData(estimatedUsage);
+      } catch (error) {
+        console.error("Error estimating usage data:", error);
+      }
+    }
   } catch (error) {
     if (signal?.aborted) {
diff --git a/ui/litellm-dashboard/src/components/chat_ui/llm_calls/process_stream.tsx b/ui/litellm-dashboard/src/components/chat_ui/llm_calls/process_stream.tsx
index 3af3fadc01..bb6058ab55 100644
--- a/ui/litellm-dashboard/src/components/chat_ui/llm_calls/process_stream.tsx
+++ b/ui/litellm-dashboard/src/components/chat_ui/llm_calls/process_stream.tsx
@@ -1,8 +1,59 @@
-import { StreamingResponse } from "../types";
+import { TokenUsage } from "../ResponseMetrics";
+
+export interface StreamingResponse {
+  id: string;
+  created: number;
+  model: string;
+  object: string;
+  system_fingerprint?: string;
+  choices: StreamingChoices[];
+  provider_specific_fields?: any;
+  stream_options?: any;
+  citations?: any;
+  usage?: Usage;
+}
+
+export interface StreamingChoices {
+  finish_reason?: string | null;
+  index: number;
+  delta: Delta;
+  logprobs?: any;
+}
+
+export interface Delta {
+  content?: string;
+  reasoning_content?: string;
+  role?: string;
+  function_call?: any;
+  tool_calls?: any;
+  audio?: any;
+  refusal?: any;
+  provider_specific_fields?: any;
+}
+
+export interface Usage {
+  completion_tokens: number;
+  prompt_tokens: number;
+  total_tokens: number;
+  completion_tokens_details?: {
+    accepted_prediction_tokens?: number;
+    audio_tokens?: number;
+    reasoning_tokens?: number;
+    rejected_prediction_tokens?: number;
+    text_tokens?: number | null;
+  };
+  prompt_tokens_details?: {
+    audio_tokens?: number;
+    cached_tokens?: number;
+    text_tokens?: number;
+    image_tokens?: number;
+  };
+}
 
 export interface StreamProcessCallbacks {
   onContent: (content: string, model?: string) => void;
   onReasoningContent: (content: string) => void;
+  onUsage?: (usage: TokenUsage) => void;
 }
 
 export const processStreamingResponse = (
@@ -25,4 +76,21 @@ export const processStreamingResponse = (
       callbacks.onReasoningContent(choice.delta.reasoning_content);
     }
   }
+
+  // Process usage information if it exists and we have a handler
+  if (response.usage && callbacks.onUsage) {
+    console.log("Processing usage data:", response.usage);
+    const usageData: TokenUsage = {
+      completionTokens: response.usage.completion_tokens,
+      promptTokens: response.usage.prompt_tokens,
+      totalTokens: response.usage.total_tokens,
+    };
+
+    // Extract reasoning tokens if available
+    if (response.usage.completion_tokens_details?.reasoning_tokens) {
+      usageData.reasoningTokens = response.usage.completion_tokens_details.reasoning_tokens;
+    }
+
+    callbacks.onUsage(usageData);
+  }
 };
\ No newline at end of file
diff --git a/ui/litellm-dashboard/src/components/chat_ui/types.ts b/ui/litellm-dashboard/src/components/chat_ui/types.ts
index 9ddc69af5e..e475e5f93d 100644
--- a/ui/litellm-dashboard/src/components/chat_ui/types.ts
+++ b/ui/litellm-dashboard/src/components/chat_ui/types.ts
@@ -9,6 +9,29 @@ export interface Delta {
   provider_specific_fields?: any;
 }
 
+export interface CompletionTokensDetails {
+  accepted_prediction_tokens?: number;
+  audio_tokens?: number;
+  reasoning_tokens?: number;
+  rejected_prediction_tokens?: number;
+  text_tokens?: number | null;
+}
+
+export interface PromptTokensDetails {
+  audio_tokens?: number;
+  cached_tokens?: number;
+  text_tokens?: number;
+  image_tokens?: number;
+}
+
+export interface Usage {
+  completion_tokens: number;
+  prompt_tokens: number;
+  total_tokens: number;
+  completion_tokens_details?: CompletionTokensDetails;
+  prompt_tokens_details?: PromptTokensDetails;
+}
+
 export interface StreamingChoices {
   finish_reason?: string | null;
   index: number;
@@ -26,6 +49,7 @@ export interface StreamingResponse {
   provider_specific_fields?: any;
   stream_options?: any;
   citations?: any;
+  usage?: Usage;
 }
 
 export interface MessageType {
@@ -34,4 +58,11 @@ export interface MessageType {
   model?: string;
   isImage?: boolean;
   reasoningContent?: string;
+  timeToFirstToken?: number;
+  usage?: {
+    completionTokens?: number;
+    promptTokens?: number;
+    totalTokens?: number;
+    reasoningTokens?: number;
+  };
 }
\ No newline at end of file
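
A minimal sketch of how the new `onUsage` callback on `processStreamingResponse` can be exercised outside the chat UI, assuming a consumer file that sits alongside `process_stream.tsx`. The `handleChunk` helper and the sample chunk values below are hypothetical; `StreamingResponse`, `StreamProcessCallbacks`, and `TokenUsage` are the shapes added in this diff.

```typescript
import { processStreamingResponse, StreamingResponse } from "./process_stream";
import { TokenUsage } from "../ResponseMetrics";

// Hypothetical helper: forward streamed text to the console and surface usage when a chunk carries it.
function handleChunk(chunk: StreamingResponse, onUsage: (usage: TokenUsage) => void): void {
  processStreamingResponse(chunk, {
    onContent: (content, model) => console.log(`[${model ?? "unknown"}] ${content}`),
    onReasoningContent: (content) => console.log(`(reasoning) ${content}`),
    onUsage, // only invoked when `chunk.usage` is present
  });
}

// Sample final chunk, shaped like a stream that reports usage (values are made up).
handleChunk(
  {
    id: "chatcmpl-example",
    created: 1700000000,
    model: "gpt-4o",
    object: "chat.completion.chunk",
    choices: [{ index: 0, delta: { content: "Hello" }, finish_reason: "stop" }],
    usage: {
      completion_tokens: 5,
      prompt_tokens: 12,
      total_tokens: 17,
      completion_tokens_details: { reasoning_tokens: 0 },
    },
  },
  (usage) => console.log("TokenUsage for the UI:", usage)
);
```

Note that usage-bearing chunks are not guaranteed: the `client.chat.completions.create` call in `chat_completion.tsx` does not request `stream_options: { include_usage: true }`, so depending on the backend the `usage` block may never arrive on a chunk, in which case the character-count estimate in that file is what `ResponseMetrics` ends up displaying.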