Mirror of https://github.com/BerriAI/litellm.git
Synced 2025-04-25 18:54:30 +00:00
LiteLLM Minor Fixes & Improvements (01/08/2025) - p2 (#7643)
* fix(streaming_chunk_builder_utils.py): add test for groq tool calling + streaming + combining chunks

  Addresses https://github.com/BerriAI/litellm/issues/7621

* fix(streaming_utils.py): fix ModelResponseIterator for the OpenAI-like chunk parser

  Ensures the chunk parser uses the correct tool call id when translating the chunk.

  Fixes https://github.com/BerriAI/litellm/issues/7621

* build(model_hub.tsx): display cost pricing on model hub

* build(model_hub.tsx): show cost-per-token pricing + complete model information

* fix(types/utils.py): fix usage object handling
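For context, a minimal sketch of the failure mode behind issue 7621, adapted from the test this PR adds below (the tool schema here is illustrative, and a groq API key must be set in the environment):

```python
# Sketch: groq streams one chunk per parallel tool call, each carrying its
# own tool-call `index`. Before this fix, reassembling the stream could
# collapse the three calls into one.
import litellm

tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["location"],
        },
    },
}]

chunks = list(
    litellm.completion(
        model="groq/llama-3.3-70b-versatile",
        messages=[{"role": "user", "content": "Weather in SF, Tokyo, and Paris?"}],
        tools=tools,
        stream=True,
    )
)
assembled = litellm.stream_chunk_builder(chunks)
# Expected after the fix: one combined tool call per city.
print(len(assembled.choices[0].message.tool_calls))  # 3
```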
This commit is contained in:
parent 39ee4c6bb4
commit 1e3370f3cb

9 changed files with 206 additions and 21 deletions
streaming_chunk_builder_utils.py

@@ -103,7 +103,8 @@ class ChunkProcessor:
     def get_combined_tool_content(
         self, tool_call_chunks: List[Dict[str, Any]]
     ) -> List[ChatCompletionMessageToolCall]:
-        argument_list: List = []
+        argument_list: List[str] = []
         delta = tool_call_chunks[0]["choices"][0]["delta"]
         id = None
         name = None

@@ -171,6 +172,7 @@ class ChunkProcessor:
                     ),
                 )
             )
+
         return tool_calls_list

     def get_combined_function_call_content(
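To make the `argument_list` change easier to review, here is a hand-rolled sketch (not the library implementation) of the combining problem `get_combined_tool_content` solves: argument fragments belonging to the same tool call are concatenated in order, while a new tool-call `index` starts a new entry.

```python
from typing import Any, Dict, List

def combine_tool_call_chunks(deltas: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Hand-rolled sketch: merge streamed tool-call deltas by tool-call index."""
    combined: Dict[int, Dict[str, Any]] = {}
    for delta in deltas:
        for tc in delta.get("tool_calls") or []:
            slot = combined.setdefault(
                tc["index"], {"id": None, "name": None, "arguments": ""}
            )
            # id and name usually appear only on a call's first fragment
            slot["id"] = slot["id"] or tc.get("id")
            slot["name"] = slot["name"] or tc["function"].get("name")
            slot["arguments"] += tc["function"].get("arguments") or ""
    return [combined[i] for i in sorted(combined)]

# Two fragments of one call (index 0) plus a second call (index 1):
deltas = [
    {"tool_calls": [{"index": 0, "id": "call_1",
                     "function": {"name": "get_current_weather",
                                  "arguments": '{"location": '}}]},
    {"tool_calls": [{"index": 0, "function": {"arguments": '"Tokyo"}'}}]},
    {"tool_calls": [{"index": 1, "id": "call_2",
                     "function": {"name": "get_current_weather",
                                  "arguments": '{"location": "Paris"}'}}]},
]
print(combine_tool_call_chunks(deltas))
# -> two combined calls, with call_1's arguments reassembled
```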
streaming_utils.py

@@ -17,7 +17,7 @@ class ModelResponseIterator:
     def chunk_parser(self, chunk: dict) -> GenericStreamingChunk:
         try:
-            processed_chunk = litellm.ModelResponse(**chunk, stream=True)  # type: ignore
+            processed_chunk = litellm.ModelResponseStream(**chunk)

             text = ""
             tool_use: Optional[ChatCompletionToolCallChunk] = None

@@ -46,7 +46,7 @@ class ModelResponseIterator:
                     .delta.tool_calls[0]  # type: ignore
                     .function.arguments,
                 ),
-                index=processed_chunk.choices[0].index,
+                index=processed_chunk.choices[0].delta.tool_calls[0].index,
             )

         if processed_chunk.choices[0].finish_reason is not None:
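The second hunk is the heart of the fix. `choices[0].index` is the index of the *choice* (always 0 when a single completion is requested), so every parallel tool call was reported with index 0; `choices[0].delta.tool_calls[0].index` is the tool call's own position within the message, which the chunk builder keys on. A standalone illustration (the chunk payload is invented, but shaped like the test data below):

```python
# Why the index fix matters: choice index vs. tool-call index.
chunk = {
    "choices": [{
        "index": 0,  # choice index: the same for every chunk when n=1
        "delta": {"tool_calls": [{
            "index": 2,  # third parallel tool call in this message
            "id": "call_hyj5",
            "function": {"name": "get_current_weather",
                         "arguments": '{"location": "Paris"}'},
        }]},
    }]
}
choice_index = chunk["choices"][0]["index"]  # what the old code used -> 0
tool_call_index = chunk["choices"][0]["delta"]["tool_calls"][0]["index"]  # -> 2
assert (choice_index, tool_call_index) == (0, 2)
```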
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -25,7 +25,3 @@ model_list:
       identifier: deepseek-ai/DeepSeek-V3-Base
       revision: main
       auth_token: os.environ/HUGGINGFACE_API_KEY
-
-service_callback: ["prometheus_system"]
-callbacks: ["prometheus"]
-cache: true
types/utils.py

@@ -847,6 +847,13 @@ class ModelResponseStream(ModelResponseBase):
         else:
             created = created

+        if (
+            "usage" in kwargs
+            and kwargs["usage"] is not None
+            and isinstance(kwargs["usage"], dict)
+        ):
+            kwargs["usage"] = Usage(**kwargs["usage"])
+
         kwargs["id"] = id
         kwargs["created"] = created
         kwargs["object"] = "chat.completion.chunk"
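The usage fix in plain terms: if a caller passes `usage` as a raw dict, it is promoted to a typed `Usage` object before the streaming chunk is built. A standalone re-statement of that behavior (assuming `litellm.types.utils.Usage` accepts the standard token-count fields):

```python
# Sketch of the coercion the diff above adds, outside the class.
from litellm.types.utils import Usage

kwargs = {"usage": {"prompt_tokens": 12, "completion_tokens": 7, "total_tokens": 19}}
if isinstance(kwargs.get("usage"), dict):
    kwargs["usage"] = Usage(**kwargs["usage"])
assert kwargs["usage"].total_tokens == 19
```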
@@ -747,6 +747,125 @@ def test_stream_chunk_builder_empty_initial_chunk():
     assert id == "1"


+def test_stream_chunk_builder_tool_calls_list():
+    from litellm.litellm_core_utils.streaming_chunk_builder_utils import (
+        ChunkProcessor,
+    )
+    from litellm.types.utils import (
+        ChatCompletionMessageToolCall,
+        Function,
+        ModelResponseStream,
+        Delta,
+        StreamingChoices,
+        ChatCompletionDeltaToolCall,
+    )
+
+    chunks = [
+        ModelResponseStream(
+            id="chatcmpl-f323f7a5-2da0-4f86-8ed7-c653c5a359d9",
+            created=1736388417,
+            model="llama-3.3-70b-versatile",
+            object="chat.completion.chunk",
+            system_fingerprint=None,
+            choices=[
+                StreamingChoices(
+                    finish_reason=None,
+                    index=0,
+                    delta=Delta(
+                        content="",
+                        role="assistant",
+                        function_call=None,
+                        tool_calls=[
+                            ChatCompletionDeltaToolCall(
+                                id="call_9y79",
+                                function=Function(
+                                    arguments='{"location": "San Francisco", "unit": "celsius"}',
+                                    name="get_current_weather",
+                                ),
+                                type="function",
+                                index=0,
+                            )
+                        ],
+                        audio=None,
+                    ),
+                    logprobs=None,
+                )
+            ],
+            stream_options=None,
+        ),
+        ModelResponseStream(
+            id="chatcmpl-f323f7a5-2da0-4f86-8ed7-c653c5a359d9",
+            created=1736388417,
+            model="llama-3.3-70b-versatile",
+            object="chat.completion.chunk",
+            system_fingerprint=None,
+            choices=[
+                StreamingChoices(
+                    finish_reason=None,
+                    index=0,
+                    delta=Delta(
+                        content="",
+                        role=None,
+                        function_call=None,
+                        tool_calls=[
+                            ChatCompletionDeltaToolCall(
+                                id="call_pfp7",
+                                function=Function(
+                                    arguments='{"location": "Tokyo", "unit": "celsius"}',
+                                    name="get_current_weather",
+                                ),
+                                type="function",
+                                index=1,
+                            )
+                        ],
+                        audio=None,
+                    ),
+                    logprobs=None,
+                )
+            ],
+            stream_options=None,
+        ),
+        ModelResponseStream(
+            id="chatcmpl-f323f7a5-2da0-4f86-8ed7-c653c5a359d9",
+            created=1736388417,
+            model="llama-3.3-70b-versatile",
+            object="chat.completion.chunk",
+            system_fingerprint=None,
+            choices=[
+                StreamingChoices(
+                    finish_reason=None,
+                    index=0,
+                    delta=Delta(
+                        content="",
+                        role=None,
+                        function_call=None,
+                        tool_calls=[
+                            ChatCompletionDeltaToolCall(
+                                id="call_hyj5",
+                                function=Function(
+                                    arguments='{"location": "Paris", "unit": "celsius"}',
+                                    name="get_current_weather",
+                                ),
+                                type="function",
+                                index=2,
+                            )
+                        ],
+                        audio=None,
+                    ),
+                    logprobs=None,
+                )
+            ],
+            stream_options=None,
+        ),
+    ]
+
+    processor = ChunkProcessor(chunks=chunks)
+
+    tool_calls = processor.get_combined_tool_content(tool_call_chunks=chunks)
+    print(f"tool_calls: {tool_calls}")
+    assert len(tool_calls) == 3
+
+
 import json
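For reference, a sketch of what the assertion expects: three combined calls, one per streamed tool-call `index` (written here as plain dicts; the real return type is `ChatCompletionMessageToolCall`):

```python
# Shape sketch of the expected combined output (values echo the test chunks).
expected = [
    {"id": "call_9y79", "type": "function",
     "function": {"name": "get_current_weather",
                  "arguments": '{"location": "San Francisco", "unit": "celsius"}'}},
    {"id": "call_pfp7", "type": "function",
     "function": {"name": "get_current_weather",
                  "arguments": '{"location": "Tokyo", "unit": "celsius"}'}},
    {"id": "call_hyj5", "type": "function",
     "function": {"name": "get_current_weather",
                  "arguments": '{"location": "Paris", "unit": "celsius"}'}},
]
assert len(expected) == 3  # one combined call per streamed index
```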
@@ -762,3 +881,55 @@ def get_current_weather(location, unit="fahrenheit"):
         return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
     else:
         return json.dumps({"location": location, "temperature": "unknown"})
+
+
+@pytest.fixture(scope="module", autouse=True)
+def load_env():
+    messages = [
+        {"role": "system", "content": "You are a helpful AI assistant"},
+        {
+            "role": "user",
+            "content": "What's the weather like in San Francisco, Tokyo, and Paris?",
+        },
+    ]
+    tools = [
+        {
+            "type": "function",
+            "function": litellm.utils.function_to_dict(get_current_weather),
+        }
+    ]
+    OPENAI_GPT4oMINI = {
+        "messages": messages,
+        "model": "gpt-4o-mini",
+        "temperature": 0.0,
+        "tools": tools,
+        "stream": True,
+    }
+    LLAMA3_3 = {
+        "messages": messages,
+        "model": "groq/llama-3.3-70b-versatile",
+        "api_base": "https://api.groq.com/openai/v1",
+        "temperature": 0.0,
+        "tools": tools,
+        "stream": True,
+    }
+    return OPENAI_GPT4oMINI, LLAMA3_3
+
+
+def execute_completion(opts: dict):
+    partial_streaming_chunks = []
+    response_gen = litellm.completion(**opts)
+    for i, part in enumerate(response_gen):
+        partial_streaming_chunks.append(part)
+    assembly = litellm.stream_chunk_builder(partial_streaming_chunks)
+    print(assembly.choices[0].message.tool_calls)
+    assert len(assembly.choices[0].message.tool_calls) == 3, (
+        assembly.choices[0].message.tool_calls[0].function.arguments[0]
+    )
+    print(assembly.choices[0].message.tool_calls)
+
+
+def test_grok_bug(load_env):
+    litellm.set_verbose = True
+    _, LLAMA3_3 = load_env
+    execute_completion(LLAMA3_3)
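Reviewer note: `test_grok_bug` exercises the live `groq/llama-3.3-70b-versatile` endpoint, so it presumably needs a groq API key in the environment, whereas `test_stream_chunk_builder_tool_calls_list` above runs entirely offline against pre-built chunks.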
model_hub.tsx

@@ -34,6 +34,8 @@ interface ModelInfo {
   supports_vision: boolean;
   max_input_tokens?: number;
   max_output_tokens?: number;
+  input_cost_per_token?: number;
+  output_cost_per_token?: number;
   supported_openai_params?: string[];
 }
@@ -161,26 +163,29 @@ const ModelHub: React.FC<ModelHubProps> = ({
                 </Tooltip>
               </pre>
               <div className="my-5">
                 <Text>Mode: {model.mode}</Text>
                 <Text>
                   Supports Function Calling:{" "}
                   {model?.supports_function_calling == true ? "Yes" : "No"}
                 </Text>
                 <Text>
                   Supports Vision:{" "}
                   {model?.supports_vision == true ? "Yes" : "No"}
                 </Text>
                 <Text>
                   Max Input Tokens:{" "}
                   {model?.max_input_tokens
                     ? model?.max_input_tokens
-                    : "N/A"}
+                    : "Unknown"}
                 </Text>
                 <Text>
                   Max Output Tokens:{" "}
                   {model?.max_output_tokens
                     ? model?.max_output_tokens
-                    : "N/A"}
+                    : "Unknown"}
                 </Text>
+                <Text>
+                  Input Cost Per Token:{" "}
+                  {model?.input_cost_per_token
+                    ? `$${(model.input_cost_per_token * 1_000_000).toFixed(2)}`
+                    : "Unknown"}
+                </Text>
+                <Text>
+                  Output Cost Per Token:{" "}
+                  {model?.output_cost_per_token
+                    ? `$${(model.output_cost_per_token * 1_000_000).toFixed(2)}`
+                    : "Unknown"}
+                </Text>
               </div>
               <div style={{ marginTop: "auto", textAlign: "right" }}>
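Although the labels read "Cost Per Token", the rendered value is the per-token rate multiplied by 1,000,000, i.e. dollars per 1M tokens. A quick sanity check of that arithmetic (the rates here are made up):

```python
# Made-up example rates: $0.15 / 1M input tokens, stored as a per-token rate.
input_cost_per_token = 0.15e-6
output_cost_per_token = 0.60e-6

def per_million(rate: float) -> str:
    """Format a per-token rate as dollars per 1M tokens, mirroring the UI."""
    return f"${rate * 1_000_000:.2f}"

assert per_million(input_cost_per_token) == "$0.15"
assert per_million(output_cost_per_token) == "$0.60"
```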
@@ -245,12 +250,19 @@ const ModelHub: React.FC<ModelHubProps> = ({

           <TabGroup>
             <TabList>
+              <Tab>Model Information</Tab>
               <Tab>OpenAI Python SDK</Tab>
               <Tab>Supported OpenAI Params</Tab>
               <Tab>LlamaIndex</Tab>
               <Tab>Langchain Py</Tab>
             </TabList>
             <TabPanels>
+              <TabPanel>
+                <Text>
+                  <strong>Model Group:</strong>
+                  <pre>{JSON.stringify(selectedModel, null, 2)}</pre>
+                </Text>
+              </TabPanel>
               <TabPanel>
                 <SyntaxHighlighter language="python">
                   {`