fix(utils.py): support returning cached streaming responses for function-calling streaming calls

Krrish Dholakia 2024-02-26 12:31:00 -08:00
parent 92ff9a1a79
commit dfb1d34e26
2 changed files with 134 additions and 80 deletions
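
For context, this change makes litellm's cache usable for streaming function/tool-calling responses: the first streaming call is written to the cache, and a repeat call replays it as well-formed delta chunks. A minimal sketch of the calling pattern, assuming Redis credentials and Azure API settings are in the environment; the deployment name comes from the test below and the tool schema is an abbreviated illustration:

    import asyncio
    import os

    import litellm
    from litellm.caching import Cache

    # Point litellm's cache at Redis (same env vars the test uses)
    litellm.cache = Cache(
        type="redis",
        host=os.environ["REDIS_HOST"],
        port=os.environ["REDIS_PORT"],
        password=os.environ["REDIS_PASSWORD"],
    )

    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",  # illustrative tool schema
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {"location": {"type": "string"}},
                    "required": ["location"],
                },
            },
        }
    ]

    async def main():
        # Identical streaming tool calls: the second is served from the cache
        # and must still arrive as valid function-calling delta chunks.
        for _ in range(2):
            response = await litellm.acompletion(
                model="azure/gpt-4-nov-release",  # deployment name from the test
                messages=[{"role": "user", "content": "What is the weather like in Boston?"}],
                tools=tools,
                tool_choice="auto",
                stream=True,
                caching=True,
            )
            async for chunk in response:
                print(chunk.choices[0].delta)

    asyncio.run(main())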


@@ -1907,6 +1907,8 @@ def test_azure_streaming_and_function_calling():

 @pytest.mark.asyncio
 async def test_azure_astreaming_and_function_calling():
+    import uuid
+
     tools = [
         {
             "type": "function",
@@ -1927,7 +1929,20 @@ async def test_azure_astreaming_and_function_calling():
             },
         }
     ]
-    messages = [{"role": "user", "content": "What is the weather like in Boston?"}]
+    messages = [
+        {
+            "role": "user",
+            "content": f"What is the weather like in Boston? {uuid.uuid4()}",
+        }
+    ]
+    from litellm.caching import Cache
+
+    litellm.cache = Cache(
+        type="redis",
+        host=os.environ["REDIS_HOST"],
+        port=os.environ["REDIS_PORT"],
+        password=os.environ["REDIS_PASSWORD"],
+    )
     try:
         response = await litellm.acompletion(
             model="azure/gpt-4-nov-release",
@@ -1938,6 +1953,7 @@ async def test_azure_astreaming_and_function_calling():
             api_base=os.getenv("AZURE_FRANCE_API_BASE"),
             api_key=os.getenv("AZURE_FRANCE_API_KEY"),
             api_version="2024-02-15-preview",
+            caching=True,
         )
         # Add any assertions here to check the response
         idx = 0
@@ -1957,6 +1973,36 @@ async def test_azure_astreaming_and_function_calling():
                 validate_final_streaming_function_calling_chunk(chunk=chunk)
             idx += 1

+        ## CACHING TEST
+        print("\n\nCACHING TESTS\n\n")
+        response = await litellm.acompletion(
+            model="azure/gpt-4-nov-release",
+            tools=tools,
+            tool_choice="auto",
+            messages=messages,
+            stream=True,
+            api_base=os.getenv("AZURE_FRANCE_API_BASE"),
+            api_key=os.getenv("AZURE_FRANCE_API_KEY"),
+            api_version="2024-02-15-preview",
+            caching=True,
+        )
+        # Add any assertions here to check the response
+        idx = 0
+        async for chunk in response:
+            print(f"chunk: {chunk}")
+            if idx == 0:
+                assert (
+                    chunk.choices[0].delta.tool_calls[0].function.arguments is not None
+                )
+                assert isinstance(
+                    chunk.choices[0].delta.tool_calls[0].function.arguments, str
+                )
+                validate_first_streaming_function_calling_chunk(chunk=chunk)
+            elif idx == 1:
+                validate_second_streaming_function_calling_chunk(chunk=chunk)
+            elif chunk.choices[0].finish_reason is not None:  # last chunk
+                validate_final_streaming_function_calling_chunk(chunk=chunk)
+            idx += 1
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
         raise e
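
Note the uuid.uuid4() appended to the prompt: it makes the messages unique per test run, so the second acompletion call within a run is a guaranteed cache hit while reruns never collide with stale Redis entries.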


@@ -213,6 +213,13 @@ class Function(OpenAIObject):
     name: str


+class ChatCompletionDeltaToolCall(OpenAIObject):
+    id: str
+    function: Function
+    type: str
+    index: int
+
+
 class ChatCompletionMessageToolCall(OpenAIObject):
     id: str
     function: Function
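
ChatCompletionDeltaToolCall mirrors the existing ChatCompletionMessageToolCall but adds the index field that streaming tool-call deltas carry. A small sketch of rehydrating one from a cached dict; the values are made up, and pydantic validates the nested dict into a Function:

    cached_tool_call = {
        "id": "call_abc123",  # illustrative id
        "type": "function",
        "function": {"name": "get_current_weather", "arguments": '{"location": "Boston"}'},
    }
    cached_tool_call.setdefault("index", 0)  # cached dicts may lack the streaming index
    tool_call = ChatCompletionDeltaToolCall(**cached_tool_call)
    assert tool_call.index == 0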
@@ -269,7 +276,15 @@ class Delta(OpenAIObject):
         self.content = content
         self.role = role
         self.function_call = function_call
-        self.tool_calls = tool_calls
+        # tool_calls may arrive as plain dicts (e.g. from a cached response): convert and default index
+        if isinstance(tool_calls, list) and len(tool_calls) > 0 and isinstance(tool_calls[0], dict):
+            self.tool_calls = []
+            for tool_call in tool_calls:
+                if tool_call.get("index", None) is None:
+                    tool_call["index"] = 0
+                self.tool_calls.append(ChatCompletionDeltaToolCall(**tool_call))
+        else:
+            self.tool_calls = tool_calls

     def __contains__(self, key):
         # Define custom behavior for the 'in' operator
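
With this, Delta accepts tool_calls either as already-typed objects or as the plain dicts a cache round-trip produces, defaulting a missing index to 0. Roughly, with illustrative values:

    delta = Delta(
        role="assistant",
        tool_calls=[
            {
                "id": "call_abc123",
                "type": "function",
                "function": {"name": "get_current_weather", "arguments": ""},
                # no "index" key: Delta fills in 0
            }
        ],
    )
    assert delta.tool_calls[0].index == 0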
@@ -5847,6 +5862,18 @@ async def convert_to_streaming_response_async(response_object: Optional[dict] =
     choice_list = []
     for idx, choice in enumerate(response_object["choices"]):
+        if (
+            choice["message"].get("tool_calls", None) is not None
+            and isinstance(choice["message"]["tool_calls"], list)
+            and len(choice["message"]["tool_calls"]) > 0
+            and isinstance(choice["message"]["tool_calls"][0], dict)
+        ):
+            pydantic_tool_calls = []
+            for index, t in enumerate(choice["message"]["tool_calls"]):
+                if "index" not in t:
+                    t["index"] = index
+                pydantic_tool_calls.append(ChatCompletionDeltaToolCall(**t))
+            choice["message"]["tool_calls"] = pydantic_tool_calls
         delta = Delta(
             content=choice["message"].get("content", None),
             role=choice["message"]["role"],
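
convert_to_streaming_response_async is the path that replays a cached, fully-materialized response as a stream; before building each Delta it converts raw tool_calls dicts into ChatCompletionDeltaToolCall, defaulting index to the list position. The defaulting rule in isolation, with illustrative data:

    raw = [
        {"id": "call_1", "type": "function", "function": {"name": "f", "arguments": "{}"}},
        {"id": "call_2", "type": "function", "function": {"name": "g", "arguments": "{}"}, "index": 5},
    ]
    converted = []
    for index, t in enumerate(raw):
        if "index" not in t:  # position wins only when "index" is absent
            t["index"] = index
        converted.append(ChatCompletionDeltaToolCall(**t))
    assert converted[0].index == 0 and converted[1].index == 5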
@@ -8646,6 +8673,7 @@ class CustomStreamWrapper:
                     "text": chunk.choices[0].delta.content,
                     "is_finished": True,
                     "finish_reason": chunk.choices[0].finish_reason,
+                    "original_chunk": chunk,
                 }
                 completion_obj["content"] = response_obj["text"]
@@ -8676,80 +8704,11 @@
                 model_response.choices[0].logprobs = response_obj["logprobs"]

             model_response.model = self.model
-            print_verbose(
-                f"model_response: {model_response}; completion_obj: {completion_obj}"
-            )
             print_verbose(
                 f"model_response finish reason 3: {model_response.choices[0].finish_reason}"
             )
-            ## FUNCTION CALL PARSING
-            if (
-                len(completion_obj["content"]) > 0
-            ):  # cannot set content of an OpenAI Object to be an empty string
-                hold, model_response_str = self.check_special_tokens(
-                    chunk=completion_obj["content"],
-                    finish_reason=model_response.choices[0].finish_reason,
-                )  # filter out bos/eos tokens from openai-compatible hf endpoints
-                print_verbose(
-                    f"hold - {hold}, model_response_str - {model_response_str}"
-                )
-                if hold is False:
-                    ## check if openai/azure chunk
-                    original_chunk = response_obj.get("original_chunk", None)
-                    if original_chunk:
-                        model_response.id = original_chunk.id
-                        if len(original_chunk.choices) > 0:
-                            try:
-                                delta = dict(original_chunk.choices[0].delta)
-                                print_verbose(f"original delta: {delta}")
-                                model_response.choices[0].delta = Delta(**delta)
-                            except Exception as e:
-                                model_response.choices[0].delta = Delta()
-                        else:
-                            return
-                        model_response.system_fingerprint = (
-                            original_chunk.system_fingerprint
-                        )
-                        print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
-                        if self.sent_first_chunk == False:
-                            model_response.choices[0].delta["role"] = "assistant"
-                            self.sent_first_chunk = True
-                        elif self.sent_first_chunk == True and hasattr(
-                            model_response.choices[0].delta, "role"
-                        ):
-                            _initial_delta = model_response.choices[
-                                0
-                            ].delta.model_dump()
-                            _initial_delta.pop("role", None)
-                            model_response.choices[0].delta = Delta(**_initial_delta)
-                        print_verbose(
-                            f"model_response.choices[0].delta: {model_response.choices[0].delta}"
-                        )
-                    else:
-                        ## else
-                        completion_obj["content"] = model_response_str
-                        if self.sent_first_chunk == False:
-                            completion_obj["role"] = "assistant"
-                            self.sent_first_chunk = True
-                        model_response.choices[0].delta = Delta(**completion_obj)
-                    print_verbose(f"returning model_response: {model_response}")
-                    return model_response
-                else:
-                    return
-            elif model_response.choices[0].finish_reason:
-                # flush any remaining holding chunk
-                if len(self.holding_chunk) > 0:
-                    if model_response.choices[0].delta.content is None:
-                        model_response.choices[0].delta.content = self.holding_chunk
-                    else:
-                        model_response.choices[0].delta.content = (
-                            self.holding_chunk + model_response.choices[0].delta.content
-                        )
-                    self.holding_chunk = ""
-                model_response.choices[0].finish_reason = map_finish_reason(
-                    model_response.choices[0].finish_reason
-                )  # ensure consistent output to openai
-                return model_response
-            elif (
+            if (
                 response_obj is not None
                 and response_obj.get("original_chunk", None) is not None
             ):  # function / tool calling branch - only set for openai/azure compatible endpoints
@@ -8783,26 +8742,75 @@
                                 original_chunk.choices[0].delta.tool_calls, list
                             ):
                                 for t in original_chunk.choices[0].delta.tool_calls:
-                                    if (
-                                        getattr(
-                                            t.function,
-                                            "arguments",
-                                        )
-                                        is None
-                                    ):
-                                        t.function.arguments = ""
+                                    if hasattr(t, "function") and hasattr(
+                                        t.function, "arguments"
+                                    ):
+                                        if (
+                                            getattr(
+                                                t.function,
+                                                "arguments",
+                                            )
+                                            is None
+                                        ):
+                                            t.function.arguments = ""
                         model_response.choices[0].delta = Delta(**delta)
                     except Exception as e:
                         traceback.print_exc()
                         model_response.choices[0].delta = Delta()
                 else:
-                    return
+                    try:
+                        delta = dict(original_chunk.choices[0].delta)
+                        print_verbose(f"original delta: {delta}")
+                        model_response.choices[0].delta = Delta(**delta)
+                    except Exception as e:
+                        model_response.choices[0].delta = Delta()
             else:
                 return
             model_response.system_fingerprint = original_chunk.system_fingerprint
             if self.sent_first_chunk == False:
                 model_response.choices[0].delta["role"] = "assistant"
                 self.sent_first_chunk = True
+
+            ## RETURN ARG
+            if (
+                response_obj.get("text", None) is not None
+                or response_obj.get("original_chunk", None) is not None
+            ):
+                hold = False
+                if response_obj.get("content", None) is not None:
+                    hold, model_response_str = self.check_special_tokens(
+                        chunk=completion_obj["content"],
+                        finish_reason=model_response.choices[0].finish_reason,
+                    )  # filter out bos/eos tokens from openai-compatible hf endpoints
+                    print_verbose(
+                        f"hold - {hold}, model_response_str - {model_response_str}"
+                    )
+                if hold is False:
+                    original_chunk = response_obj.get("original_chunk", None)
+                    if original_chunk is None:
+                        completion_obj["content"] = model_response_str
+                        if self.sent_first_chunk == False:
+                            completion_obj["role"] = "assistant"
+                            self.sent_first_chunk = True
+                        model_response.choices[0].delta = Delta(**completion_obj)
+                    print_verbose(f"returning model_response: {model_response}")
+                    return model_response
+                else:
+                    return
+            elif model_response.choices[0].finish_reason is not None:
+                # flush any remaining holding chunk
+                if len(self.holding_chunk) > 0:
+                    if model_response.choices[0].delta.content is None:
+                        model_response.choices[0].delta.content = self.holding_chunk
+                    else:
+                        model_response.choices[0].delta.content = (
+                            self.holding_chunk + model_response.choices[0].delta.content
+                        )
+                    self.holding_chunk = ""
+                # get any function call arguments
+                model_response.choices[0].finish_reason = map_finish_reason(
+                    model_response.choices[0].finish_reason
+                )  # ensure consistent output to openai
                 return model_response
             else:
                 return
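
Net effect of the two CustomStreamWrapper hunks: the return decision moves out of the content-only branch into a single ## RETURN ARG block, so chunks that carry a tool-call delta but no text content (exactly what a cached function-calling stream replays) are returned instead of silently dropped. A rough sketch of the consolidated guard, not verbatim from the code above:

    def _should_emit(response_obj, model_response) -> bool:
        if (
            response_obj.get("text") is not None
            or response_obj.get("original_chunk") is not None
        ):
            return True  # plain text, or an openai/azure tool-calling chunk
        # otherwise only the final chunk (finish_reason set) is flushed
        return model_response.choices[0].finish_reason is not None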