Litellm dev 12 24 2024 p4 (#7407)

* fix(invoke_handler.py): fix mock response iterator to handle tool calling

returns tool call if returned by model response

* fix(prometheus.py): add new 'tokens_by_tag' metric on prometheus

allows tracking 'token usage' by tag

* feat(prometheus.py): add input + output token tracking by tag

* feat(prometheus.py): add tag based deployment failure tracking

allows admins to track failures by use-case
This commit is contained in:
Krish Dholakia 2024-12-24 20:24:06 -08:00 committed by GitHub
parent 81be0b4090
commit 39dabb2e89
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 209 additions and 12 deletions

View file

@ -9,7 +9,17 @@ import types
import urllib.parse
import uuid
from functools import partial
from typing import Any, AsyncIterator, Callable, Iterator, List, Optional, Tuple, Union
from typing import (
Any,
AsyncIterator,
Callable,
Iterator,
List,
Optional,
Tuple,
Union,
cast,
)
import httpx # type: ignore
@ -36,8 +46,10 @@ from litellm.llms.custom_httpx.http_handler import (
from litellm.types.llms.bedrock import *
from litellm.types.llms.openai import (
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
ChatCompletionUsageBlock,
)
from litellm.types.utils import ChatCompletionMessageToolCall, Choices
from litellm.types.utils import GenericStreamingChunk as GChunk
from litellm.types.utils import ModelResponse, Usage
from litellm.utils import CustomStreamWrapper, get_secret
@ -1294,11 +1306,25 @@ class MockResponseIterator: # for returning ai21 streaming responses
chunk_usage: Usage = getattr(chunk_data, "usage")
text = chunk_data.choices[0].message.content or "" # type: ignore
tool_use = None
_model_response_tool_call = cast(
Optional[List[ChatCompletionMessageToolCall]],
cast(Choices, chunk_data.choices[0]).message.tool_calls,
)
if self.json_mode is True:
text, tool_use = self._handle_json_mode_chunk(
text=text,
tool_calls=chunk_data.choices[0].message.tool_calls, # type: ignore
)
elif _model_response_tool_call is not None:
tool_use = ChatCompletionToolCallChunk(
id=_model_response_tool_call[0].id,
type="function",
function=ChatCompletionToolCallFunctionChunk(
name=_model_response_tool_call[0].function.name,
arguments=_model_response_tool_call[0].function.arguments,
),
index=0,
)
processed_chunk = GChunk(
text=text,
tool_use=tool_use,