LiteLLM Minor Fixes & Improvements (01/08/2025) - p2 (#7643)

* fix(streaming_chunk_builder_utils.py): add test for groq tool calling + streaming + combine chunks

Addresses https://github.com/BerriAI/litellm/issues/7621
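For context, this is roughly the scenario the new test covers — a sketch, not the test itself; it assumes `GROQ_API_KEY` is set and uses an illustrative weather tool schema:

```python
import litellm

# Illustrative tool schema; the real test builds it from a local get_current_weather helper.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        },
    }
]

chunks = list(
    litellm.completion(
        model="groq/llama-3.3-70b-versatile",
        messages=[
            {
                "role": "user",
                "content": "What's the weather like in San Francisco, Tokyo, and Paris?",
            }
        ],
        tools=tools,
        stream=True,
    )
)

# Before this fix, parallel tool calls from groq streams could be collapsed into a single entry.
assembled = litellm.stream_chunk_builder(chunks)
assert len(assembled.choices[0].message.tool_calls) == 3
```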

* fix(streaming_utils.py): fix ModelResponseIterator for OpenAI-like chunk parser

Ensures the chunk parser uses the correct tool call id and index when translating the chunk.

Fixes https://github.com/BerriAI/litellm/issues/7621
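In practice the bug showed up with parallel tool calls: every choice in a single-choice stream has `index=0`, while each tool call delta carries its own `id` and `index`. A rough illustration (not the parser code itself), assuming the chunk shape groq returns:

```python
from litellm.types.utils import ModelResponseStream

# A simplified second chunk of a parallel-tool-call stream (shape assumed for illustration).
chunk = {
    "id": "chatcmpl-123",
    "created": 1736388417,
    "model": "llama-3.3-70b-versatile",
    "object": "chat.completion.chunk",
    "choices": [
        {
            "index": 0,
            "delta": {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "call_pfp7",
                        "type": "function",
                        "index": 1,  # second parallel tool call
                        "function": {
                            "name": "get_current_weather",
                            "arguments": '{"location": "Tokyo", "unit": "celsius"}',
                        },
                    }
                ],
            },
        }
    ],
}

processed = ModelResponseStream(**chunk)
# The choice index is always 0 here; the tool call's own id/index is what must be propagated.
assert processed.choices[0].index == 0
assert processed.choices[0].delta.tool_calls[0].index == 1
assert processed.choices[0].delta.tool_calls[0].id == "call_pfp7"
```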

* build(model_hub.tsx): display cost pricing on model hub

* build(model_hub.tsx): show cost per token pricing + complete model information
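The hub renders prices as dollars per 1M tokens by multiplying the per-token rate by 1,000,000. A quick sanity check of that conversion (illustrative rates, not real pricing):

```python
# Sanity check of the per-1M-token display math used in the model hub UI
# (illustrative values, not real pricing).
input_cost_per_token = 0.00000015   # $ per input token
output_cost_per_token = 0.0000006   # $ per output token

print(f"Input: ${input_cost_per_token * 1_000_000:.2f} / 1M tokens")    # Input: $0.15 / 1M tokens
print(f"Output: ${output_cost_per_token * 1_000_000:.2f} / 1M tokens")  # Output: $0.60 / 1M tokens
```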

* fix(types/utils.py): fix usage object handling
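A minimal sketch of what the fix changes, assuming a final streaming chunk that carries a `usage` dict (e.g. when `stream_options={"include_usage": True}` is requested): the dict is now coerced into a typed `Usage` object when constructing `ModelResponseStream`.

```python
from litellm.types.utils import ModelResponseStream, Usage

# Hypothetical final chunk of a stream that reports token usage.
chunk = ModelResponseStream(
    id="chatcmpl-123",
    created=1736388417,
    model="llama-3.3-70b-versatile",
    choices=[],
    usage={"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
)

# With this fix the raw dict is converted to a Usage object instead of being passed through as-is.
assert isinstance(getattr(chunk, "usage", None), Usage)
```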
Krish Dholakia authored on 2025-01-08 19:45:19 -08:00, committed by GitHub
Commit 1e3370f3cb (parent 39ee4c6bb4)
9 changed files with 206 additions and 21 deletions

streaming_chunk_builder_utils.py:

@@ -103,7 +103,8 @@ class ChunkProcessor:
     def get_combined_tool_content(
         self, tool_call_chunks: List[Dict[str, Any]]
     ) -> List[ChatCompletionMessageToolCall]:
-        argument_list: List = []
+        argument_list: List[str] = []
         delta = tool_call_chunks[0]["choices"][0]["delta"]
         id = None
         name = None
@@ -171,6 +172,7 @@ class ChunkProcessor:
                 ),
             )
         )

         return tool_calls_list

     def get_combined_function_call_content(

streaming_utils.py:

@@ -17,7 +17,7 @@ class ModelResponseIterator:
     def chunk_parser(self, chunk: dict) -> GenericStreamingChunk:
         try:
-            processed_chunk = litellm.ModelResponse(**chunk, stream=True)  # type: ignore
+            processed_chunk = litellm.ModelResponseStream(**chunk)

             text = ""
             tool_use: Optional[ChatCompletionToolCallChunk] = None
@@ -46,7 +46,7 @@ class ModelResponseIterator:
                     .delta.tool_calls[0]  # type: ignore
                     .function.arguments,
                 ),
-                index=processed_chunk.choices[0].index,
+                index=processed_chunk.choices[0].delta.tool_calls[0].index,
             )

             if processed_chunk.choices[0].finish_reason is not None:

(3 file diffs suppressed because one or more lines are too long)

@@ -25,7 +25,3 @@ model_list:
       identifier: deepseek-ai/DeepSeek-V3-Base
       revision: main
       auth_token: os.environ/HUGGINGFACE_API_KEY
-service_callback: ["prometheus_system"]
-callbacks: ["prometheus"]
-cache: true

types/utils.py:

@@ -847,6 +847,13 @@ class ModelResponseStream(ModelResponseBase):
         else:
             created = created

+        if (
+            "usage" in kwargs
+            and kwargs["usage"] is not None
+            and isinstance(kwargs["usage"], dict)
+        ):
+            kwargs["usage"] = Usage(**kwargs["usage"])
+
         kwargs["id"] = id
         kwargs["created"] = created
         kwargs["object"] = "chat.completion.chunk"

@@ -747,6 +747,125 @@ def test_stream_chunk_builder_empty_initial_chunk():
     assert id == "1"


+def test_stream_chunk_builder_tool_calls_list():
+    from litellm.litellm_core_utils.streaming_chunk_builder_utils import (
+        ChunkProcessor,
+    )
+    from litellm.types.utils import (
+        ChatCompletionMessageToolCall,
+        Function,
+        ModelResponseStream,
+        Delta,
+        StreamingChoices,
+        ChatCompletionDeltaToolCall,
+    )
+
+    chunks = [
+        ModelResponseStream(
+            id="chatcmpl-f323f7a5-2da0-4f86-8ed7-c653c5a359d9",
+            created=1736388417,
+            model="llama-3.3-70b-versatile",
+            object="chat.completion.chunk",
+            system_fingerprint=None,
+            choices=[
+                StreamingChoices(
+                    finish_reason=None,
+                    index=0,
+                    delta=Delta(
+                        content="",
+                        role="assistant",
+                        function_call=None,
+                        tool_calls=[
+                            ChatCompletionDeltaToolCall(
+                                id="call_9y79",
+                                function=Function(
+                                    arguments='{"location": "San Francisco", "unit": "celsius"}',
+                                    name="get_current_weather",
+                                ),
+                                type="function",
+                                index=0,
+                            )
+                        ],
+                        audio=None,
+                    ),
+                    logprobs=None,
+                )
+            ],
+            stream_options=None,
+        ),
+        ModelResponseStream(
+            id="chatcmpl-f323f7a5-2da0-4f86-8ed7-c653c5a359d9",
+            created=1736388417,
+            model="llama-3.3-70b-versatile",
+            object="chat.completion.chunk",
+            system_fingerprint=None,
+            choices=[
+                StreamingChoices(
+                    finish_reason=None,
+                    index=0,
+                    delta=Delta(
+                        content="",
+                        role=None,
+                        function_call=None,
+                        tool_calls=[
+                            ChatCompletionDeltaToolCall(
+                                id="call_pfp7",
+                                function=Function(
+                                    arguments='{"location": "Tokyo", "unit": "celsius"}',
+                                    name="get_current_weather",
+                                ),
+                                type="function",
+                                index=1,
+                            )
+                        ],
+                        audio=None,
+                    ),
+                    logprobs=None,
+                )
+            ],
+            stream_options=None,
+        ),
+        ModelResponseStream(
+            id="chatcmpl-f323f7a5-2da0-4f86-8ed7-c653c5a359d9",
+            created=1736388417,
+            model="llama-3.3-70b-versatile",
+            object="chat.completion.chunk",
+            system_fingerprint=None,
+            choices=[
+                StreamingChoices(
+                    finish_reason=None,
+                    index=0,
+                    delta=Delta(
+                        content="",
+                        role=None,
+                        function_call=None,
+                        tool_calls=[
+                            ChatCompletionDeltaToolCall(
+                                id="call_hyj5",
+                                function=Function(
+                                    arguments='{"location": "Paris", "unit": "celsius"}',
+                                    name="get_current_weather",
+                                ),
+                                type="function",
+                                index=2,
+                            )
+                        ],
+                        audio=None,
+                    ),
+                    logprobs=None,
+                )
+            ],
+            stream_options=None,
+        ),
+    ]
+
+    processor = ChunkProcessor(chunks=chunks)
+    tool_calls = processor.get_combined_tool_content(tool_call_chunks=chunks)
+    print(f"tool_calls: {tool_calls}")
+    assert len(tool_calls) == 3
+
+
 import json
@@ -762,3 +881,55 @@ def get_current_weather(location, unit="fahrenheit"):
         return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
     else:
         return json.dumps({"location": location, "temperature": "unknown"})
+
+
+@pytest.fixture(scope="module", autouse=True)
+def load_env():
+    messages = [
+        {"role": "system", "content": "You are a helpful AI assistant"},
+        {
+            "role": "user",
+            "content": "What's the weather like in San Francisco, Tokyo, and Paris?",
+        },
+    ]
+    tools = [
+        {
+            "type": "function",
+            "function": litellm.utils.function_to_dict(get_current_weather),
+        }
+    ]
+    OPENAI_GPT4oMINI = {
+        "messages": messages,
+        "model": "gpt-4o-mini",
+        "temperature": 0.0,
+        "tools": tools,
+        "stream": True,
+    }
+    LLAMA3_3 = {
+        "messages": messages,
+        "model": "groq/llama-3.3-70b-versatile",
+        "api_base": "https://api.groq.com/openai/v1",
+        "temperature": 0.0,
+        "tools": tools,
+        "stream": True,
+    }
+    return OPENAI_GPT4oMINI, LLAMA3_3
+
+
+def execute_completion(opts: dict):
+    partial_streaming_chunks = []
+    response_gen = litellm.completion(**opts)
+    for i, part in enumerate(response_gen):
+        partial_streaming_chunks.append(part)
+    assembly = litellm.stream_chunk_builder(partial_streaming_chunks)
+    print(assembly.choices[0].message.tool_calls)
+    assert len(assembly.choices[0].message.tool_calls) == 3, (
+        assembly.choices[0].message.tool_calls[0].function.arguments[0]
+    )
+    print(assembly.choices[0].message.tool_calls)
+
+
+def test_grok_bug(load_env):
+    litellm.set_verbose = True
+    _, LLAMA3_3 = load_env
+    execute_completion(LLAMA3_3)

model_hub.tsx:

@@ -34,6 +34,8 @@ interface ModelInfo {
   supports_vision: boolean;
   max_input_tokens?: number;
   max_output_tokens?: number;
+  input_cost_per_token?: number;
+  output_cost_per_token?: number;
   supported_openai_params?: string[];
 }
@@ -161,26 +163,29 @@ const ModelHub: React.FC<ModelHubProps> = ({
               </Tooltip>
             </pre>
             <div className="my-5">
-              <Text>Mode: {model.mode}</Text>
-              <Text>
-                Supports Function Calling:{" "}
-                {model?.supports_function_calling == true ? "Yes" : "No"}
-              </Text>
-              <Text>
-                Supports Vision:{" "}
-                {model?.supports_vision == true ? "Yes" : "No"}
-              </Text>
               <Text>
                 Max Input Tokens:{" "}
                 {model?.max_input_tokens
                   ? model?.max_input_tokens
-                  : "N/A"}
+                  : "Unknown"}
               </Text>
               <Text>
                 Max Output Tokens:{" "}
                 {model?.max_output_tokens
                   ? model?.max_output_tokens
-                  : "N/A"}
+                  : "Unknown"}
+              </Text>
+              <Text>
+                Input Cost Per Token:{" "}
+                {model?.input_cost_per_token
+                  ? `$${(model.input_cost_per_token * 1_000_000).toFixed(2)}`
+                  : "Unknown"}
+              </Text>
+              <Text>
+                Output Cost Per Token:{" "}
+                {model?.output_cost_per_token
+                  ? `$${(model.output_cost_per_token * 1_000_000).toFixed(2)}`
+                  : "Unknown"}
               </Text>
             </div>
             <div style={{ marginTop: "auto", textAlign: "right" }}>
@@ -245,12 +250,19 @@ const ModelHub: React.FC<ModelHubProps> = ({
             <TabGroup>
               <TabList>
+                <Tab>Model Information</Tab>
                 <Tab>OpenAI Python SDK</Tab>
                 <Tab>Supported OpenAI Params</Tab>
                 <Tab>LlamaIndex</Tab>
                 <Tab>Langchain Py</Tab>
               </TabList>
               <TabPanels>
+                <TabPanel>
+                  <Text>
+                    <strong>Model Group:</strong>
+                    <pre>{JSON.stringify(selectedModel, null, 2)}</pre>
+                  </Text>
+                </TabPanel>
                 <TabPanel>
                   <SyntaxHighlighter language="python">
                     {`