LiteLLM Minor Fixes & Improvements (01/08/2025) - p2 (#7643)

* fix(streaming_chunk_builder_utils.py): add test for groq tool calling + streaming + combine chunks

Addresses https://github.com/BerriAI/litellm/issues/7621
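For context, this is roughly the scenario the new test covers — a sketch, not the test itself; it assumes `GROQ_API_KEY` is set and uses an illustrative weather tool schema:

```python
import litellm

# Illustrative tool schema; the real test builds it from a local get_current_weather helper.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        },
    }
]

chunks = list(
    litellm.completion(
        model="groq/llama-3.3-70b-versatile",
        messages=[
            {
                "role": "user",
                "content": "What's the weather like in San Francisco, Tokyo, and Paris?",
            }
        ],
        tools=tools,
        stream=True,
    )
)

# Before this fix, parallel tool calls from groq streams could be collapsed into a single entry.
assembled = litellm.stream_chunk_builder(chunks)
assert len(assembled.choices[0].message.tool_calls) == 3
```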

* fix(streaming_utils.py): fix ModelResponseIterator for OpenAI-like chunk parser

Ensures the chunk parser uses the correct tool call id and index when translating the chunk.

Fixes https://github.com/BerriAI/litellm/issues/7621
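In practice the bug showed up with parallel tool calls: every choice in a single-choice stream has `index=0`, while each tool call delta carries its own `id` and `index`. A rough illustration (not the parser code itself), assuming the chunk shape groq returns:

```python
from litellm.types.utils import ModelResponseStream

# A simplified second chunk of a parallel-tool-call stream (shape assumed for illustration).
chunk = {
    "id": "chatcmpl-123",
    "created": 1736388417,
    "model": "llama-3.3-70b-versatile",
    "object": "chat.completion.chunk",
    "choices": [
        {
            "index": 0,
            "delta": {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "call_pfp7",
                        "type": "function",
                        "index": 1,  # second parallel tool call
                        "function": {
                            "name": "get_current_weather",
                            "arguments": '{"location": "Tokyo", "unit": "celsius"}',
                        },
                    }
                ],
            },
        }
    ],
}

processed = ModelResponseStream(**chunk)
# The choice index is always 0 here; the tool call's own id/index is what must be propagated.
assert processed.choices[0].index == 0
assert processed.choices[0].delta.tool_calls[0].index == 1
assert processed.choices[0].delta.tool_calls[0].id == "call_pfp7"
```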

* build(model_hub.tsx): display cost pricing on model hub

* build(model_hub.tsx): show cost per token pricing + complete model information
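The hub renders prices as dollars per 1M tokens by multiplying the per-token rate by 1,000,000. A quick sanity check of that conversion (illustrative rates, not real pricing):

```python
# Sanity check of the per-1M-token display math used in the model hub UI
# (illustrative values, not real pricing).
input_cost_per_token = 0.00000015   # $ per input token
output_cost_per_token = 0.0000006   # $ per output token

print(f"Input: ${input_cost_per_token * 1_000_000:.2f} / 1M tokens")    # Input: $0.15 / 1M tokens
print(f"Output: ${output_cost_per_token * 1_000_000:.2f} / 1M tokens")  # Output: $0.60 / 1M tokens
```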

* fix(types/utils.py): fix usage object handling
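A minimal sketch of what the fix changes, assuming a final streaming chunk that carries a `usage` dict (e.g. when `stream_options={"include_usage": True}` is requested): the dict is now coerced into a typed `Usage` object when constructing `ModelResponseStream`.

```python
from litellm.types.utils import ModelResponseStream, Usage

# Hypothetical final chunk of a stream that reports token usage.
chunk = ModelResponseStream(
    id="chatcmpl-123",
    created=1736388417,
    model="llama-3.3-70b-versatile",
    choices=[],
    usage={"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
)

# With this fix the raw dict is converted to a Usage object instead of being passed through as-is.
assert isinstance(getattr(chunk, "usage", None), Usage)
```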
Krish Dholakia authored on 2025-01-08 19:45:19 -08:00, committed by GitHub
Commit 1e3370f3cb (parent 39ee4c6bb4)
9 changed files with 206 additions and 21 deletions

streaming_chunk_builder_utils.py:

@@ -103,7 +103,8 @@ class ChunkProcessor:
     def get_combined_tool_content(
         self, tool_call_chunks: List[Dict[str, Any]]
     ) -> List[ChatCompletionMessageToolCall]:
-        argument_list: List = []
+        argument_list: List[str] = []
         delta = tool_call_chunks[0]["choices"][0]["delta"]
         id = None
         name = None
@@ -171,6 +172,7 @@ class ChunkProcessor:
                 ),
             )
         )

         return tool_calls_list

     def get_combined_function_call_content(

streaming_utils.py:

@@ -17,7 +17,7 @@ class ModelResponseIterator:
     def chunk_parser(self, chunk: dict) -> GenericStreamingChunk:
         try:
-            processed_chunk = litellm.ModelResponse(**chunk, stream=True)  # type: ignore
+            processed_chunk = litellm.ModelResponseStream(**chunk)

             text = ""
             tool_use: Optional[ChatCompletionToolCallChunk] = None
@@ -46,7 +46,7 @@ class ModelResponseIterator:
                     .delta.tool_calls[0]  # type: ignore
                     .function.arguments,
                 ),
-                index=processed_chunk.choices[0].index,
+                index=processed_chunk.choices[0].delta.tool_calls[0].index,
             )

             if processed_chunk.choices[0].finish_reason is not None:

(3 file diffs suppressed because one or more lines are too long)

@@ -25,7 +25,3 @@ model_list:
       identifier: deepseek-ai/DeepSeek-V3-Base
       revision: main
       auth_token: os.environ/HUGGINGFACE_API_KEY
-service_callback: ["prometheus_system"]
-callbacks: ["prometheus"]
-cache: true

types/utils.py:

@@ -847,6 +847,13 @@ class ModelResponseStream(ModelResponseBase):
         else:
             created = created

+        if (
+            "usage" in kwargs
+            and kwargs["usage"] is not None
+            and isinstance(kwargs["usage"], dict)
+        ):
+            kwargs["usage"] = Usage(**kwargs["usage"])
+
         kwargs["id"] = id
         kwargs["created"] = created
         kwargs["object"] = "chat.completion.chunk"

@@ -747,6 +747,125 @@ def test_stream_chunk_builder_empty_initial_chunk():
     assert id == "1"


+def test_stream_chunk_builder_tool_calls_list():
+    from litellm.litellm_core_utils.streaming_chunk_builder_utils import (
+        ChunkProcessor,
+    )
+    from litellm.types.utils import (
+        ChatCompletionMessageToolCall,
+        Function,
+        ModelResponseStream,
+        Delta,
+        StreamingChoices,
+        ChatCompletionDeltaToolCall,
+    )
+
+    chunks = [
+        ModelResponseStream(
+            id="chatcmpl-f323f7a5-2da0-4f86-8ed7-c653c5a359d9",
+            created=1736388417,
+            model="llama-3.3-70b-versatile",
+            object="chat.completion.chunk",
+            system_fingerprint=None,
+            choices=[
+                StreamingChoices(
+                    finish_reason=None,
+                    index=0,
+                    delta=Delta(
+                        content="",
+                        role="assistant",
+                        function_call=None,
+                        tool_calls=[
+                            ChatCompletionDeltaToolCall(
+                                id="call_9y79",
+                                function=Function(
+                                    arguments='{"location": "San Francisco", "unit": "celsius"}',
+                                    name="get_current_weather",
+                                ),
+                                type="function",
+                                index=0,
+                            )
+                        ],
+                        audio=None,
+                    ),
+                    logprobs=None,
+                )
+            ],
+            stream_options=None,
+        ),
+        ModelResponseStream(
+            id="chatcmpl-f323f7a5-2da0-4f86-8ed7-c653c5a359d9",
+            created=1736388417,
+            model="llama-3.3-70b-versatile",
+            object="chat.completion.chunk",
+            system_fingerprint=None,
+            choices=[
+                StreamingChoices(
+                    finish_reason=None,
+                    index=0,
+                    delta=Delta(
+                        content="",
+                        role=None,
+                        function_call=None,
+                        tool_calls=[
+                            ChatCompletionDeltaToolCall(
+                                id="call_pfp7",
+                                function=Function(
+                                    arguments='{"location": "Tokyo", "unit": "celsius"}',
+                                    name="get_current_weather",
+                                ),
+                                type="function",
+                                index=1,
+                            )
+                        ],
+                        audio=None,
+                    ),
+                    logprobs=None,
+                )
+            ],
+            stream_options=None,
+        ),
+        ModelResponseStream(
+            id="chatcmpl-f323f7a5-2da0-4f86-8ed7-c653c5a359d9",
+            created=1736388417,
+            model="llama-3.3-70b-versatile",
+            object="chat.completion.chunk",
+            system_fingerprint=None,
+            choices=[
+                StreamingChoices(
+                    finish_reason=None,
+                    index=0,
+                    delta=Delta(
+                        content="",
+                        role=None,
+                        function_call=None,
+                        tool_calls=[
+                            ChatCompletionDeltaToolCall(
+                                id="call_hyj5",
+                                function=Function(
+                                    arguments='{"location": "Paris", "unit": "celsius"}',
+                                    name="get_current_weather",
+                                ),
+                                type="function",
+                                index=2,
+                            )
+                        ],
+                        audio=None,
+                    ),
+                    logprobs=None,
+                )
+            ],
+            stream_options=None,
+        ),
+    ]
+
+    processor = ChunkProcessor(chunks=chunks)
+    tool_calls = processor.get_combined_tool_content(tool_call_chunks=chunks)
+    print(f"tool_calls: {tool_calls}")
+    assert len(tool_calls) == 3
+
+
 import json
@@ -762,3 +881,55 @@ def get_current_weather(location, unit="fahrenheit"):
         return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
     else:
         return json.dumps({"location": location, "temperature": "unknown"})
+
+
+@pytest.fixture(scope="module", autouse=True)
+def load_env():
+    messages = [
+        {"role": "system", "content": "You are a helpful AI assistant"},
+        {
+            "role": "user",
+            "content": "What's the weather like in San Francisco, Tokyo, and Paris?",
+        },
+    ]
+    tools = [
+        {
+            "type": "function",
+            "function": litellm.utils.function_to_dict(get_current_weather),
+        }
+    ]
+    OPENAI_GPT4oMINI = {
+        "messages": messages,
+        "model": "gpt-4o-mini",
+        "temperature": 0.0,
+        "tools": tools,
+        "stream": True,
+    }
+    LLAMA3_3 = {
+        "messages": messages,
+        "model": "groq/llama-3.3-70b-versatile",
+        "api_base": "https://api.groq.com/openai/v1",
+        "temperature": 0.0,
+        "tools": tools,
+        "stream": True,
+    }
+    return OPENAI_GPT4oMINI, LLAMA3_3
+
+
+def execute_completion(opts: dict):
+    partial_streaming_chunks = []
+    response_gen = litellm.completion(**opts)
+    for i, part in enumerate(response_gen):
+        partial_streaming_chunks.append(part)
+    assembly = litellm.stream_chunk_builder(partial_streaming_chunks)
+    print(assembly.choices[0].message.tool_calls)
+    assert len(assembly.choices[0].message.tool_calls) == 3, (
+        assembly.choices[0].message.tool_calls[0].function.arguments[0]
+    )
+    print(assembly.choices[0].message.tool_calls)
+
+
+def test_grok_bug(load_env):
+    litellm.set_verbose = True
+    _, LLAMA3_3 = load_env
+    execute_completion(LLAMA3_3)

model_hub.tsx:

@@ -34,6 +34,8 @@ interface ModelInfo {
   supports_vision: boolean;
   max_input_tokens?: number;
   max_output_tokens?: number;
+  input_cost_per_token?: number;
+  output_cost_per_token?: number;
   supported_openai_params?: string[];
 }
@@ -161,26 +163,29 @@ const ModelHub: React.FC<ModelHubProps> = ({
               </Tooltip>
             </pre>
             <div className="my-5">
-              <Text>Mode: {model.mode}</Text>
-              <Text>
-                Supports Function Calling:{" "}
-                {model?.supports_function_calling == true ? "Yes" : "No"}
-              </Text>
-              <Text>
-                Supports Vision:{" "}
-                {model?.supports_vision == true ? "Yes" : "No"}
-              </Text>
               <Text>
                 Max Input Tokens:{" "}
                 {model?.max_input_tokens
                   ? model?.max_input_tokens
-                  : "N/A"}
+                  : "Unknown"}
               </Text>
               <Text>
                 Max Output Tokens:{" "}
                 {model?.max_output_tokens
                   ? model?.max_output_tokens
-                  : "N/A"}
+                  : "Unknown"}
+              </Text>
+              <Text>
+                Input Cost Per Token:{" "}
+                {model?.input_cost_per_token
+                  ? `$${(model.input_cost_per_token * 1_000_000).toFixed(2)}`
+                  : "Unknown"}
+              </Text>
+              <Text>
+                Output Cost Per Token:{" "}
+                {model?.output_cost_per_token
+                  ? `$${(model.output_cost_per_token * 1_000_000).toFixed(2)}`
+                  : "Unknown"}
               </Text>
             </div>
             <div style={{ marginTop: "auto", textAlign: "right" }}>
@@ -245,12 +250,19 @@ const ModelHub: React.FC<ModelHubProps> = ({
             <TabGroup>
               <TabList>
+                <Tab>Model Information</Tab>
                 <Tab>OpenAI Python SDK</Tab>
                 <Tab>Supported OpenAI Params</Tab>
                 <Tab>LlamaIndex</Tab>
                 <Tab>Langchain Py</Tab>
               </TabList>
               <TabPanels>
+                <TabPanel>
+                  <Text>
+                    <strong>Model Group:</strong>
+                    <pre>{JSON.stringify(selectedModel, null, 2)}</pre>
+                  </Text>
+                </TabPanel>
                 <TabPanel>
                   <SyntaxHighlighter language="python">
                     {`