From 39486e2003b49db092c436cc664bc8d11d47c993 Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Mon, 14 Oct 2024 22:11:14 -0700 Subject: [PATCH] Litellm dev 10 14 2024 (#6221) * fix(__init__.py): expose DualCache, RedisCache, InMemoryCache on root abstract internal file refactors from impacting users * feat(utils.py): handle invalid openai parallel tool calling response Fixes https://community.openai.com/t/model-tries-to-call-unknown-function-multi-tool-use-parallel/490653 * docs(bedrock.md): clarify all bedrock models are supported Closes https://github.com/BerriAI/litellm/issues/6168#issuecomment-2412082236 --- docs/my-website/docs/providers/bedrock.md | 2 +- litellm/__init__.py | 2 +- .../proxy/hooks/parallel_request_limiter.py | 3 +- litellm/utils.py | 65 ++++++- tests/local_testing/test_completion.py | 173 ++++++++++++++++++ 5 files changed, 240 insertions(+), 5 deletions(-) diff --git a/docs/my-website/docs/providers/bedrock.md b/docs/my-website/docs/providers/bedrock.md index 6548714b2..279098d12 100644 --- a/docs/my-website/docs/providers/bedrock.md +++ b/docs/my-website/docs/providers/bedrock.md @@ -2,7 +2,7 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; # AWS Bedrock -Anthropic, Amazon Titan, A121 LLMs are Supported on Bedrock +ALL Bedrock models (Anthropic, Meta, Mistral, Amazon, etc.) are Supported LiteLLM requires `boto3` to be installed on your system for Bedrock requests ```shell diff --git a/litellm/__init__.py b/litellm/__init__.py index f6713646d..5326d1cc6 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -7,7 +7,7 @@ import threading import os from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler -from litellm.caching.caching import Cache +from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache from litellm._logging import ( set_verbose, _turn_on_debug, diff --git a/litellm/proxy/hooks/parallel_request_limiter.py b/litellm/proxy/hooks/parallel_request_limiter.py index 36e5fecff..f34a9bbac 100644 --- a/litellm/proxy/hooks/parallel_request_limiter.py +++ b/litellm/proxy/hooks/parallel_request_limiter.py @@ -7,9 +7,8 @@ from fastapi import HTTPException from pydantic import BaseModel import litellm -from litellm import ModelResponse +from litellm import DualCache, ModelResponse from litellm._logging import verbose_proxy_logger -from litellm.caching.caching import DualCache from litellm.integrations.custom_logger import CustomLogger from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs from litellm.proxy._types import CurrentItemRateLimit, UserAPIKeyAuth diff --git a/litellm/utils.py b/litellm/utils.py index 2457bdf4c..085fe7116 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -79,6 +79,7 @@ from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.secret_managers.main import get_secret from litellm.types.llms.openai import ( AllMessageValues, + ChatCompletionAssistantToolCall, ChatCompletionNamedToolChoiceParam, ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk, @@ -89,11 +90,13 @@ from litellm.types.utils import ( OPENAI_RESPONSE_HEADERS, CallTypes, ChatCompletionDeltaToolCall, + ChatCompletionMessageToolCall, Choices, CostPerToken, Delta, Embedding, EmbeddingResponse, + Function, ImageResponse, Message, ModelInfo, @@ -5612,6 +5615,54 @@ def convert_to_streaming_response(response_object: Optional[dict] = None): yield 
model_response_object +from collections import defaultdict + + +def _handle_invalid_parallel_tool_calls( + tool_calls: List[ChatCompletionMessageToolCall], +): + """ + Handle hallucinated parallel tool call from openai - https://community.openai.com/t/model-tries-to-call-unknown-function-multi-tool-use-parallel/490653 + + Code modified from: https://github.com/phdowling/openai_multi_tool_use_parallel_patch/blob/main/openai_multi_tool_use_parallel_patch.py + """ + + if tool_calls is None: + return + + replacements: Dict[int, List[ChatCompletionMessageToolCall]] = defaultdict(list) + for i, tool_call in enumerate(tool_calls): + current_function = tool_call.function.name + function_args = json.loads(tool_call.function.arguments) + if current_function == "multi_tool_use.parallel": + verbose_logger.debug( + "OpenAI did a weird pseudo-multi-tool-use call, fixing call structure.." + ) + for _fake_i, _fake_tool_use in enumerate(function_args["tool_uses"]): + _function_args = _fake_tool_use["parameters"] + _current_function = _fake_tool_use["recipient_name"] + if _current_function.startswith("functions."): + _current_function = _current_function[len("functions.") :] + + fixed_tc = ChatCompletionMessageToolCall( + id=f"{tool_call.id}_{_fake_i}", + type="function", + function=Function( + name=_current_function, arguments=json.dumps(_function_args) + ), + ) + replacements[i].append(fixed_tc) + + shift = 0 + for i, replacement in replacements.items(): + tool_calls[:] = ( + tool_calls[: i + shift] + replacement + tool_calls[i + shift + 1 :] + ) + shift += len(replacement) + + return tool_calls + + def convert_to_model_response_object( response_object: Optional[dict] = None, model_response_object: Optional[ @@ -5707,6 +5758,18 @@ def convert_to_model_response_object( for idx, choice in enumerate(response_object["choices"]): ## HANDLE JSON MODE - anthropic returns single function call] tool_calls = choice["message"].get("tool_calls", None) + if tool_calls is not None: + _openai_tool_calls = [] + for _tc in tool_calls: + _openai_tc = ChatCompletionMessageToolCall(**_tc) + _openai_tool_calls.append(_openai_tc) + fixed_tool_calls = _handle_invalid_parallel_tool_calls( + _openai_tool_calls + ) + + if fixed_tool_calls is not None: + tool_calls = fixed_tool_calls + message: Optional[Message] = None finish_reason: Optional[str] = None if ( @@ -5726,7 +5789,7 @@ def convert_to_model_response_object( content=choice["message"].get("content", None), role=choice["message"]["role"] or "assistant", function_call=choice["message"].get("function_call", None), - tool_calls=choice["message"].get("tool_calls", None), + tool_calls=tool_calls, ) finish_reason = choice.get("finish_reason", None) if finish_reason is None: diff --git a/tests/local_testing/test_completion.py b/tests/local_testing/test_completion.py index ecc3c8034..33c0b67f1 100644 --- a/tests/local_testing/test_completion.py +++ b/tests/local_testing/test_completion.py @@ -4567,3 +4567,176 @@ def test_completion_response_ratelimit_headers(model, stream): assert v != "None" and v is not None assert "x-ratelimit-remaining-requests" in additional_headers assert "x-ratelimit-remaining-tokens" in additional_headers + + +def _openai_hallucinated_tool_call_mock_response( + *args, **kwargs +) -> litellm.ModelResponse: + new_response = MagicMock() + new_response.headers = {"hello": "world"} + + response_object = { + "id": "chatcmpl-123", + "object": "chat.completion", + "created": 1677652288, + "model": "gpt-3.5-turbo-0125", + "system_fingerprint": "fp_44709d6fcb", + 
"choices": [ + { + "index": 0, + "message": { + "content": None, + "role": "assistant", + "tool_calls": [ + { + "function": { + "arguments": '{"tool_uses":[{"recipient_name":"product_title","parameters":{"content":"Story Scribe"}},{"recipient_name":"one_liner","parameters":{"content":"Transform interview transcripts into actionable user stories"}}]}', + "name": "multi_tool_use.parallel", + }, + "id": "call_IzGXwVa5OfBd9XcCJOkt2q0s", + "type": "function", + } + ], + }, + "logprobs": None, + "finish_reason": "stop", + } + ], + "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21}, + } + from openai import OpenAI + from openai.types.chat.chat_completion import ChatCompletion + + pydantic_obj = ChatCompletion(**response_object) # type: ignore + pydantic_obj.choices[0].message.role = None # type: ignore + new_response.parse.return_value = pydantic_obj + return new_response + + +def test_openai_hallucinated_tool_call(): + """ + Patch for this issue: https://community.openai.com/t/model-tries-to-call-unknown-function-multi-tool-use-parallel/490653 + + Handle openai invalid tool calling response. + + OpenAI assistant will sometimes return an invalid tool calling response, which needs to be parsed + + - "arguments": "{\"tool_uses\":[{\"recipient_name\":\"product_title\",\"parameters\":{\"content\":\"Story Scribe\"}},{\"recipient_name\":\"one_liner\",\"parameters\":{\"content\":\"Transform interview transcripts into actionable user stories\"}}]}", + + To extract actual tool calls: + + 1. Parse arguments JSON object + 2. Iterate over tool_uses array to call functions: + - get function name from recipient_name value + - parameters will be JSON object for function arguments + """ + import openai + + openai_client = openai.OpenAI() + with patch.object( + openai_client.chat.completions, + "create", + side_effect=_openai_hallucinated_tool_call_mock_response, + ) as mock_response: + response = litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hey! how's it going?"}], + client=openai_client, + ) + print(f"response: {response}") + + response_dict = response.model_dump() + + tool_calls = response_dict["choices"][0]["message"]["tool_calls"] + + print(f"tool_calls: {tool_calls}") + + for idx, tc in enumerate(tool_calls): + if idx == 0: + print(f"tc in test_openai_hallucinated_tool_call: {tc}") + assert tc == { + "function": { + "arguments": '{"content": "Story Scribe"}', + "name": "product_title", + }, + "id": "call_IzGXwVa5OfBd9XcCJOkt2q0s_0", + "type": "function", + } + elif idx == 1: + assert tc == { + "function": { + "arguments": '{"content": "Transform interview transcripts into actionable user stories"}', + "name": "one_liner", + }, + "id": "call_IzGXwVa5OfBd9XcCJOkt2q0s_1", + "type": "function", + } + + +@pytest.mark.parametrize( + "function_name, expect_modification", + [ + ("multi_tool_use.parallel", True), + ("my-fake-function", False), + ], +) +def test_openai_hallucinated_tool_call_util(function_name, expect_modification): + """ + Patch for this issue: https://community.openai.com/t/model-tries-to-call-unknown-function-multi-tool-use-parallel/490653 + + Handle openai invalid tool calling response. 
+ + OpenAI assistant will sometimes return an invalid tool calling response, which needs to be parsed + + - "arguments": "{\"tool_uses\":[{\"recipient_name\":\"product_title\",\"parameters\":{\"content\":\"Story Scribe\"}},{\"recipient_name\":\"one_liner\",\"parameters\":{\"content\":\"Transform interview transcripts into actionable user stories\"}}]}", + + To extract actual tool calls: + + 1. Parse arguments JSON object + 2. Iterate over tool_uses array to call functions: + - get function name from recipient_name value + - parameters will be JSON object for function arguments + """ + from litellm.utils import _handle_invalid_parallel_tool_calls + from litellm.types.utils import ChatCompletionMessageToolCall + + response = _handle_invalid_parallel_tool_calls( + tool_calls=[ + ChatCompletionMessageToolCall( + **{ + "function": { + "arguments": '{"tool_uses":[{"recipient_name":"product_title","parameters":{"content":"Story Scribe"}},{"recipient_name":"one_liner","parameters":{"content":"Transform interview transcripts into actionable user stories"}}]}', + "name": function_name, + }, + "id": "call_IzGXwVa5OfBd9XcCJOkt2q0s", + "type": "function", + } + ) + ] + ) + + print(f"response: {response}") + + if expect_modification: + for idx, tc in enumerate(response): + if idx == 0: + assert tc.model_dump() == { + "function": { + "arguments": '{"content": "Story Scribe"}', + "name": "product_title", + }, + "id": "call_IzGXwVa5OfBd9XcCJOkt2q0s_0", + "type": "function", + } + elif idx == 1: + assert tc.model_dump() == { + "function": { + "arguments": '{"content": "Transform interview transcripts into actionable user stories"}', + "name": "one_liner", + }, + "id": "call_IzGXwVa5OfBd9XcCJOkt2q0s_1", + "type": "function", + } + else: + assert len(response) == 1 + assert response[0].function.name == function_name
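
The heart of this patch is the `multi_tool_use.parallel` repair added to `litellm/utils.py`. For readers who want the same behavior outside litellm, below is a minimal standalone sketch of that repair. The helper name `unpack_parallel_tool_call` and the plain-dict tool-call shape are illustrative assumptions only; litellm's actual `_handle_invalid_parallel_tool_calls` operates on `ChatCompletionMessageToolCall` objects and splices the fixed calls back into the original list in place.

```python
import json
from typing import Any, Dict, List


def unpack_parallel_tool_call(tool_call: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Split a hallucinated `multi_tool_use.parallel` call into real tool calls.

    `tool_call` is a plain dict in the OpenAI chat-completion shape:
    {"id": ..., "type": "function", "function": {"name": ..., "arguments": "<JSON string>"}}.
    Returns a single-element list with the original call if no repair is needed.
    """
    if tool_call.get("function", {}).get("name") != "multi_tool_use.parallel":
        return [tool_call]

    args = json.loads(tool_call["function"]["arguments"])
    fixed: List[Dict[str, Any]] = []
    for i, tool_use in enumerate(args.get("tool_uses", [])):
        name = tool_use["recipient_name"]
        # The model usually prefixes real function names with "functions."
        if name.startswith("functions."):
            name = name[len("functions."):]
        fixed.append(
            {
                # Suffix the original id so each synthetic call stays unique but traceable
                "id": f"{tool_call['id']}_{i}",
                "type": "function",
                "function": {
                    "name": name,
                    "arguments": json.dumps(tool_use["parameters"]),
                },
            }
        )
    return fixed


# Usage with the payload from the tests above: yields two separate tool calls,
# product_title and one_liner, with ids call_..._0 and call_..._1.
hallucinated = {
    "id": "call_IzGXwVa5OfBd9XcCJOkt2q0s",
    "type": "function",
    "function": {
        "name": "multi_tool_use.parallel",
        "arguments": '{"tool_uses":[{"recipient_name":"product_title","parameters":{"content":"Story Scribe"}},{"recipient_name":"one_liner","parameters":{"content":"Transform interview transcripts into actionable user stories"}}]}',
    },
}

for tc in unpack_parallel_tool_call(hallucinated):
    print(tc["id"], tc["function"]["name"], tc["function"]["arguments"])
```

After this patch, callers do not need to perform this repair themselves: `convert_to_model_response_object` applies it automatically, so a `litellm.completion` response that would have contained the single hallucinated call instead exposes the individual tool calls, as exercised in `test_openai_hallucinated_tool_call` and `test_openai_hallucinated_tool_call_util` above.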