diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index 66e8be4cbe..dc1e1a097a 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -1907,6 +1907,8 @@ def test_azure_streaming_and_function_calling():
 
 @pytest.mark.asyncio
 async def test_azure_astreaming_and_function_calling():
+    import uuid
+
     tools = [
         {
             "type": "function",
@@ -1927,7 +1929,20 @@ async def test_azure_astreaming_and_function_calling():
             },
         }
     ]
-    messages = [{"role": "user", "content": "What is the weather like in Boston?"}]
+    messages = [
+        {
+            "role": "user",
+            "content": f"What is the weather like in Boston? {uuid.uuid4()}",
+        }
+    ]
+    from litellm.caching import Cache
+
+    litellm.cache = Cache(
+        type="redis",
+        host=os.environ["REDIS_HOST"],
+        port=os.environ["REDIS_PORT"],
+        password=os.environ["REDIS_PASSWORD"],
+    )
     try:
         response = await litellm.acompletion(
             model="azure/gpt-4-nov-release",
@@ -1938,6 +1953,7 @@ async def test_azure_astreaming_and_function_calling():
             api_base=os.getenv("AZURE_FRANCE_API_BASE"),
             api_key=os.getenv("AZURE_FRANCE_API_KEY"),
             api_version="2024-02-15-preview",
+            caching=True,
         )
         # Add any assertions here to check the response
         idx = 0
@@ -1957,6 +1973,36 @@ async def test_azure_astreaming_and_function_calling():
                 validate_final_streaming_function_calling_chunk(chunk=chunk)
             idx += 1
 
+        ## CACHING TEST
+        print("\n\nCACHING TESTS\n\n")
+        response = await litellm.acompletion(
+            model="azure/gpt-4-nov-release",
+            tools=tools,
+            tool_choice="auto",
+            messages=messages,
+            stream=True,
+            api_base=os.getenv("AZURE_FRANCE_API_BASE"),
+            api_key=os.getenv("AZURE_FRANCE_API_KEY"),
+            api_version="2024-02-15-preview",
+            caching=True,
+        )
+        # Add any assertions here to check the response
+        idx = 0
+        async for chunk in response:
+            print(f"chunk: {chunk}")
+            if idx == 0:
+                assert (
+                    chunk.choices[0].delta.tool_calls[0].function.arguments is not None
+                )
+                assert isinstance(
+                    chunk.choices[0].delta.tool_calls[0].function.arguments, str
+                )
+                validate_first_streaming_function_calling_chunk(chunk=chunk)
+            elif idx == 1:
+                validate_second_streaming_function_calling_chunk(chunk=chunk)
+            elif chunk.choices[0].finish_reason is not None:  # last chunk
+                validate_final_streaming_function_calling_chunk(chunk=chunk)
+            idx += 1
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
         raise e
diff --git a/litellm/utils.py b/litellm/utils.py
index 7de5199b46..ad9b65998f 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -213,6 +213,13 @@ class Function(OpenAIObject):
     name: str
 
 
+class ChatCompletionDeltaToolCall(OpenAIObject):
+    id: str
+    function: Function
+    type: str
+    index: int
+
+
 class ChatCompletionMessageToolCall(OpenAIObject):
     id: str
     function: Function
@@ -269,7 +276,15 @@ class Delta(OpenAIObject):
         self.content = content
         self.role = role
         self.function_call = function_call
-        self.tool_calls = tool_calls
+        if tool_calls is not None:
+            if isinstance(tool_calls, list):
+                self.tool_calls = []
+                for tool_call in tool_calls:
+                    if tool_call.get("index", None) is None:
+                        tool_call["index"] = 0
+                    self.tool_calls.append(ChatCompletionDeltaToolCall(**tool_call))
+            else:
+                self.tool_calls = tool_calls
 
     def __contains__(self, key):
         # Define custom behavior for the 'in' operator
@@ -5847,6 +5862,18 @@ async def convert_to_streaming_response_async(response_object: Optional[dict] =
 
     choice_list = []
     for idx, choice in enumerate(response_object["choices"]):
+        if (
+            choice["message"].get("tool_calls", None) is not None
+            and isinstance(choice["message"]["tool_calls"], list)
+            and len(choice["message"]["tool_calls"]) > 0
+            and isinstance(choice["message"]["tool_calls"][0], dict)
+        ):
+            pydantic_tool_calls = []
+            for index, t in enumerate(choice["message"]["tool_calls"]):
+                if "index" not in t:
+                    t["index"] = index
+                pydantic_tool_calls.append(ChatCompletionDeltaToolCall(**t))
+            choice["message"]["tool_calls"] = pydantic_tool_calls
         delta = Delta(
             content=choice["message"].get("content", None),
             role=choice["message"]["role"],
@@ -8646,6 +8673,7 @@ class CustomStreamWrapper:
                     "text": chunk.choices[0].delta.content,
                     "is_finished": True,
                     "finish_reason": chunk.choices[0].finish_reason,
+                    "original_chunk": chunk,
                 }
 
                 completion_obj["content"] = response_obj["text"]
@@ -8676,80 +8704,11 @@ class CustomStreamWrapper:
                 model_response.choices[0].logprobs = response_obj["logprobs"]
 
             model_response.model = self.model
-            print_verbose(
-                f"model_response: {model_response}; completion_obj: {completion_obj}"
-            )
             print_verbose(
                 f"model_response finish reason 3: {model_response.choices[0].finish_reason}"
             )
+            ## FUNCTION CALL PARSING
             if (
-                len(completion_obj["content"]) > 0
-            ):  # cannot set content of an OpenAI Object to be an empty string
-                hold, model_response_str = self.check_special_tokens(
-                    chunk=completion_obj["content"],
-                    finish_reason=model_response.choices[0].finish_reason,
-                )  # filter out bos/eos tokens from openai-compatible hf endpoints
-                print_verbose(
-                    f"hold - {hold}, model_response_str - {model_response_str}"
-                )
-                if hold is False:
-                    ## check if openai/azure chunk
-                    original_chunk = response_obj.get("original_chunk", None)
-                    if original_chunk:
-                        model_response.id = original_chunk.id
-                        if len(original_chunk.choices) > 0:
-                            try:
-                                delta = dict(original_chunk.choices[0].delta)
-                                print_verbose(f"original delta: {delta}")
-                                model_response.choices[0].delta = Delta(**delta)
-                            except Exception as e:
-                                model_response.choices[0].delta = Delta()
-                        else:
-                            return
-                        model_response.system_fingerprint = (
-                            original_chunk.system_fingerprint
-                        )
-                        print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
-                        if self.sent_first_chunk == False:
-                            model_response.choices[0].delta["role"] = "assistant"
-                            self.sent_first_chunk = True
-                        elif self.sent_first_chunk == True and hasattr(
-                            model_response.choices[0].delta, "role"
-                        ):
-                            _initial_delta = model_response.choices[
-                                0
-                            ].delta.model_dump()
-                            _initial_delta.pop("role", None)
-                            model_response.choices[0].delta = Delta(**_initial_delta)
-                        print_verbose(
-                            f"model_response.choices[0].delta: {model_response.choices[0].delta}"
-                        )
-                    else:
-                        ## else
-                        completion_obj["content"] = model_response_str
-                        if self.sent_first_chunk == False:
-                            completion_obj["role"] = "assistant"
-                            self.sent_first_chunk = True
-                        model_response.choices[0].delta = Delta(**completion_obj)
-                    print_verbose(f"returning model_response: {model_response}")
-                    return model_response
-                else:
-                    return
-            elif model_response.choices[0].finish_reason:
-                # flush any remaining holding chunk
-                if len(self.holding_chunk) > 0:
-                    if model_response.choices[0].delta.content is None:
-                        model_response.choices[0].delta.content = self.holding_chunk
-                    else:
-                        model_response.choices[0].delta.content = (
-                            self.holding_chunk + model_response.choices[0].delta.content
-                        )
-                self.holding_chunk = ""
-                model_response.choices[0].finish_reason = map_finish_reason(
-                    model_response.choices[0].finish_reason
-                )  # ensure consistent output to openai
-                return model_response
-            elif (
                 response_obj is not None
                 and response_obj.get("original_chunk", None) is not None
             ):  # function / tool calling branch - only set for openai/azure compatible endpoints
@@ -8783,26 +8742,75 @@ class CustomStreamWrapper:
                                 original_chunk.choices[0].delta.tool_calls, list
                             ):
                                 for t in original_chunk.choices[0].delta.tool_calls:
-                                    if (
-                                        getattr(
-                                            t.function,
-                                            "arguments",
-                                        )
-                                        is None
+                                    if hasattr(t, "function") and hasattr(
+                                        t.function, "arguments"
                                     ):
-                                        t.function.arguments = ""
+                                        if (
+                                            getattr(
+                                                t.function,
+                                                "arguments",
+                                            )
+                                            is None
+                                        ):
+                                            t.function.arguments = ""
                             model_response.choices[0].delta = Delta(**delta)
                         except Exception as e:
                             traceback.print_exc()
                             model_response.choices[0].delta = Delta()
                     else:
-                        return
+                        try:
+                            delta = dict(original_chunk.choices[0].delta)
+                            print_verbose(f"original delta: {delta}")
+                            model_response.choices[0].delta = Delta(**delta)
+                        except Exception as e:
+                            model_response.choices[0].delta = Delta()
                 else:
                     return
                 model_response.system_fingerprint = original_chunk.system_fingerprint
                 if self.sent_first_chunk == False:
                     model_response.choices[0].delta["role"] = "assistant"
                     self.sent_first_chunk = True
+
+            ## RETURN ARG
+            if (
+                response_obj.get("text", None) is not None
+                or response_obj.get("original_chunk", None) is not None
+            ):
+                hold = False
+                if response_obj.get("content", None) is not None:
+                    hold, model_response_str = self.check_special_tokens(
+                        chunk=completion_obj["content"],
+                        finish_reason=model_response.choices[0].finish_reason,
+                    )  # filter out bos/eos tokens from openai-compatible hf endpoints
+                    print_verbose(
+                        f"hold - {hold}, model_response_str - {model_response_str}"
+                    )
+                if hold is False:
+                    original_chunk = response_obj.get("original_chunk", None)
+                    if original_chunk is None:
+                        completion_obj["content"] = model_response_str
+                        if self.sent_first_chunk == False:
+                            completion_obj["role"] = "assistant"
+                            self.sent_first_chunk = True
+                        model_response.choices[0].delta = Delta(**completion_obj)
+                    print_verbose(f"returning model_response: {model_response}")
+                    return model_response
+                else:
+                    return
+            elif model_response.choices[0].finish_reason is not None:
+                # flush any remaining holding chunk
+                if len(self.holding_chunk) > 0:
+                    if model_response.choices[0].delta.content is None:
+                        model_response.choices[0].delta.content = self.holding_chunk
+                    else:
+                        model_response.choices[0].delta.content = (
+                            self.holding_chunk + model_response.choices[0].delta.content
+                        )
+                self.holding_chunk = ""
+                # get any function call arguments
+                model_response.choices[0].finish_reason = map_finish_reason(
+                    model_response.choices[0].finish_reason
+                )  # ensure consistent output to openai
                 return model_response
             else:
                 return
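
Note on the `Delta` change in `litellm/utils.py` above: a cache hit returns streamed tool-call chunks as plain dicts, so `Delta` now rebuilds them as `ChatCompletionDeltaToolCall` objects and backfills the `index` field that OpenAI streaming deltas carry. Below is a minimal sketch of that behavior, assuming `Delta` is importable from `litellm.utils` as patched and that the nested `function` dict is coerced to `Function` pydantic-style; the payload values are invented for illustration.

```python
from litellm.utils import Delta

# Hypothetical tool-call dict, shaped like what a redis cache hit returns
# for a streamed function-calling response. Note there is no "index" key.
cached_tool_call = {
    "id": "call_abc123",  # invented id
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "arguments": '{"location": "Boston, MA"}',
    },
}

# The patched Delta backfills index=0 and wraps the dict in a
# ChatCompletionDeltaToolCall, so downstream chunk validation sees the
# same pydantic objects as a live (non-cached) stream.
delta = Delta(content=None, role="assistant", tool_calls=[cached_tool_call])
print(delta.tool_calls[0].index)               # -> 0
print(delta.tool_calls[0].function.arguments)  # -> '{"location": "Boston, MA"}'
```

Without the `index` backfill, replaying a cached stream would fail `ChatCompletionDeltaToolCall` validation, which is why the second (cached) pass in `test_azure_astreaming_and_function_calling` can reuse the same chunk validators as the first pass.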