diff --git a/docs/my-website/docs/providers/anthropic.md b/docs/my-website/docs/providers/anthropic.md
index 0c7b2a442..290e094d0 100644
--- a/docs/my-website/docs/providers/anthropic.md
+++ b/docs/my-website/docs/providers/anthropic.md
@@ -864,3 +864,96 @@ Human: How do I boil water?
Assistant:
```
+
+## Usage - PDF
+
+Pass base64-encoded PDF files to Anthropic models via the `image_url` field. When a PDF is detected in the request, LiteLLM automatically adds the `pdfs-2024-09-25` beta header to the Anthropic API call.
+
+### **Using base64**
+```python
+from litellm import completion, supports_pdf_input
+import base64
+import requests
+
+# URL of the file
+url = "https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf"
+
+# Download the file
+response = requests.get(url)
+file_data = response.content
+
+encoded_file = base64.b64encode(file_data).decode("utf-8")
+
+## check if model supports pdf input - (2024/11/11) only claude-3-5-haiku-20241022 supports it
+supports_pdf_input("anthropic/claude-3-5-haiku-20241022") # True
+
+response = completion(
+ model="anthropic/claude-3-5-haiku-20241022",
+ messages=[
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."},
+ {
+ "type": "image_url",
+ "image_url": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
+ },
+ ],
+ }
+ ],
+ max_tokens=300,
+)
+
+print(response.choices[0])
+```
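+
+If the PDF is already on disk, the same pattern applies - read the bytes, base64-encode them, and pass the data URL. A minimal sketch (the file path below is illustrative):
+
+```python
+from litellm import completion
+import base64
+
+# read a local PDF and base64-encode it (path is just an example)
+with open("./my_document.pdf", "rb") as f:
+    encoded_file = base64.b64encode(f.read()).decode("utf-8")
+
+response = completion(
+    model="anthropic/claude-3-5-haiku-20241022",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Please summarize the given document."},
+                {
+                    "type": "image_url",
+                    "image_url": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
+                },
+            ],
+        }
+    ],
+    max_tokens=300,
+)
+
+print(response.choices[0])
+```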
+
+### **Usage with LiteLLM Proxy**
+
+1. Add model to your config.yaml
+
+```yaml
+model_list:
+  - model_name: claude-3-5-haiku-20241022
+    litellm_params:
+      model: anthropic/claude-3-5-haiku-20241022
+      api_key: os.environ/ANTHROPIC_API_KEY
+```
+
+2. Start Proxy
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+3. Test it!
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -H "Authorization: Bearer " \
+ -d '{
+ "model": "claude-3-5-haiku-20241022",
+ "messages": [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": "You are a very professional document summarization specialist. Please summarize the given document"
+ },
+ {
+ "type": "image_url",
+ "image_url": "data:application/pdf;base64,{encoded_file}" # 👈 PDF
+ }
+ }
+ ]
+ }
+ ],
+ "max_tokens": 300
+ }'
+
+```
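+
+The proxy can also be called with the OpenAI Python SDK. A minimal sketch, assuming the proxy is running locally on port 4000 and `sk-1234` is a valid virtual key (replace both with your own values):
+
+```python
+import base64
+from openai import OpenAI
+
+# point the OpenAI client at the LiteLLM proxy (base_url and api_key are assumptions)
+client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")
+
+# base64-encode a local PDF (path is illustrative)
+with open("./my_document.pdf", "rb") as f:
+    encoded_file = base64.b64encode(f.read()).decode("utf-8")
+
+response = client.chat.completions.create(
+    model="claude-3-5-haiku-20241022",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Please summarize the given document."},
+                {
+                    "type": "image_url",
+                    "image_url": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF
+                },
+            ],
+        }
+    ],
+    max_tokens=300,
+)
+
+print(response.choices[0])
+```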
+
+
diff --git a/litellm/__init__.py b/litellm/__init__.py
index b739afb93..9812de1d8 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -375,6 +375,7 @@ open_ai_text_completion_models: List = []
cohere_models: List = []
cohere_chat_models: List = []
mistral_chat_models: List = []
+text_completion_codestral_models: List = []
anthropic_models: List = []
empower_models: List = []
openrouter_models: List = []
@@ -401,6 +402,19 @@ deepinfra_models: List = []
perplexity_models: List = []
watsonx_models: List = []
gemini_models: List = []
+xai_models: List = []
+deepseek_models: List = []
+azure_ai_models: List = []
+voyage_models: List = []
+databricks_models: List = []
+cloudflare_models: List = []
+codestral_models: List = []
+friendliai_models: List = []
+palm_models: List = []
+groq_models: List = []
+azure_models: List = []
+anyscale_models: List = []
+cerebras_models: List = []
def add_known_models():
@@ -477,6 +491,34 @@ def add_known_models():
# ignore the 'up-to', '-to-' model names -> not real models. just for cost tracking based on model params.
if "-to-" not in key:
fireworks_ai_embedding_models.append(key)
+ elif value.get("litellm_provider") == "text-completion-codestral":
+ text_completion_codestral_models.append(key)
+ elif value.get("litellm_provider") == "xai":
+ xai_models.append(key)
+ elif value.get("litellm_provider") == "deepseek":
+ deepseek_models.append(key)
+ elif value.get("litellm_provider") == "azure_ai":
+ azure_ai_models.append(key)
+ elif value.get("litellm_provider") == "voyage":
+ voyage_models.append(key)
+ elif value.get("litellm_provider") == "databricks":
+ databricks_models.append(key)
+ elif value.get("litellm_provider") == "cloudflare":
+ cloudflare_models.append(key)
+ elif value.get("litellm_provider") == "codestral":
+ codestral_models.append(key)
+ elif value.get("litellm_provider") == "friendliai":
+ friendliai_models.append(key)
+ elif value.get("litellm_provider") == "palm":
+ palm_models.append(key)
+ elif value.get("litellm_provider") == "groq":
+ groq_models.append(key)
+ elif value.get("litellm_provider") == "azure":
+ azure_models.append(key)
+ elif value.get("litellm_provider") == "anyscale":
+ anyscale_models.append(key)
+ elif value.get("litellm_provider") == "cerebras":
+ cerebras_models.append(key)
add_known_models()
@@ -722,6 +764,20 @@ model_list = (
+ vertex_language_models
+ watsonx_models
+ gemini_models
+ + text_completion_codestral_models
+ + xai_models
+ + deepseek_models
+ + azure_ai_models
+ + voyage_models
+ + databricks_models
+ + cloudflare_models
+ + codestral_models
+ + friendliai_models
+ + palm_models
+ + groq_models
+ + azure_models
+ + anyscale_models
+ + cerebras_models
)
@@ -778,6 +834,7 @@ class LlmProviders(str, Enum):
FIREWORKS_AI = "fireworks_ai"
FRIENDLIAI = "friendliai"
WATSONX = "watsonx"
+ WATSONX_TEXT = "watsonx_text"
TRITON = "triton"
PREDIBASE = "predibase"
DATABRICKS = "databricks"
@@ -794,6 +851,7 @@ provider_list: List[Union[LlmProviders, str]] = list(LlmProviders)
models_by_provider: dict = {
"openai": open_ai_chat_completion_models + open_ai_text_completion_models,
+ "text-completion-openai": open_ai_text_completion_models,
"cohere": cohere_models + cohere_chat_models,
"cohere_chat": cohere_chat_models,
"anthropic": anthropic_models,
@@ -817,6 +875,23 @@ models_by_provider: dict = {
"watsonx": watsonx_models,
"gemini": gemini_models,
"fireworks_ai": fireworks_ai_models + fireworks_ai_embedding_models,
+ "aleph_alpha": aleph_alpha_models,
+ "text-completion-codestral": text_completion_codestral_models,
+ "xai": xai_models,
+ "deepseek": deepseek_models,
+ "mistral": mistral_chat_models,
+ "azure_ai": azure_ai_models,
+ "voyage": voyage_models,
+ "databricks": databricks_models,
+ "cloudflare": cloudflare_models,
+ "codestral": codestral_models,
+ "nlp_cloud": nlp_cloud_models,
+ "friendliai": friendliai_models,
+ "palm": palm_models,
+ "groq": groq_models,
+ "azure": azure_models,
+ "anyscale": anyscale_models,
+ "cerebras": cerebras_models,
}
# mapping for those models which have larger equivalents
@@ -889,7 +964,6 @@ from .utils import (
supports_system_messages,
get_litellm_params,
acreate,
- get_model_list,
get_max_tokens,
get_model_info,
register_prompt_template,
diff --git a/litellm/integrations/opentelemetry.py b/litellm/integrations/opentelemetry.py
index 8102f2c60..30a280e57 100644
--- a/litellm/integrations/opentelemetry.py
+++ b/litellm/integrations/opentelemetry.py
@@ -2,14 +2,16 @@ import os
from dataclasses import dataclass
from datetime import datetime
from functools import wraps
-from typing import TYPE_CHECKING, Any, Dict, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.types.services import ServiceLoggerPayload
from litellm.types.utils import (
+ ChatCompletionMessageToolCall,
EmbeddingResponse,
+ Function,
ImageResponse,
ModelResponse,
StandardLoggingPayload,
@@ -403,6 +405,28 @@ class OpenTelemetry(CustomLogger):
except Exception:
return ""
+ @staticmethod
+ def _tool_calls_kv_pair(
+ tool_calls: List[ChatCompletionMessageToolCall],
+ ) -> Dict[str, Any]:
+ from litellm.proxy._types import SpanAttributes
+
+ kv_pairs: Dict[str, Any] = {}
+ for idx, tool_call in enumerate(tool_calls):
+ _function = tool_call.get("function")
+ if not _function:
+ continue
+
+ keys = Function.__annotations__.keys()
+ for key in keys:
+ _value = _function.get(key)
+ if _value:
+ kv_pairs[
+ f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.function_call.{key}"
+ ] = _value
+
+ return kv_pairs
+
def set_attributes( # noqa: PLR0915
self, span: Span, kwargs, response_obj: Optional[Any]
):
@@ -597,18 +621,13 @@ class OpenTelemetry(CustomLogger):
message = choice.get("message")
tool_calls = message.get("tool_calls")
if tool_calls:
- self.safe_set_attribute(
- span=span,
- key=f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.function_call.name",
- value=tool_calls[0].get("function").get("name"),
- )
- self.safe_set_attribute(
- span=span,
- key=f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.function_call.arguments",
- value=tool_calls[0]
- .get("function")
- .get("arguments"),
- )
+ kv_pairs = OpenTelemetry._tool_calls_kv_pair(tool_calls) # type: ignore
+ for key, value in kv_pairs.items():
+ self.safe_set_attribute(
+ span=span,
+ key=key,
+ value=value,
+ )
except Exception as e:
verbose_logger.exception(
diff --git a/litellm/llms/anthropic/chat/handler.py b/litellm/llms/anthropic/chat/handler.py
index da95ac075..2d119a28f 100644
--- a/litellm/llms/anthropic/chat/handler.py
+++ b/litellm/llms/anthropic/chat/handler.py
@@ -71,11 +71,12 @@ def validate_environment(
prompt_caching_set = AnthropicConfig().is_cache_control_set(messages=messages)
computer_tool_used = AnthropicConfig().is_computer_tool_used(tools=tools)
-
+ pdf_used = AnthropicConfig().is_pdf_used(messages=messages)
headers = AnthropicConfig().get_anthropic_headers(
anthropic_version=anthropic_version,
computer_tool_used=computer_tool_used,
prompt_caching_set=prompt_caching_set,
+ pdf_used=pdf_used,
api_key=api_key,
)
diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py
index ec3285473..18c53b696 100644
--- a/litellm/llms/anthropic/chat/transformation.py
+++ b/litellm/llms/anthropic/chat/transformation.py
@@ -104,6 +104,7 @@ class AnthropicConfig:
anthropic_version: Optional[str] = None,
computer_tool_used: bool = False,
prompt_caching_set: bool = False,
+ pdf_used: bool = False,
) -> dict:
import json
@@ -112,6 +113,8 @@ class AnthropicConfig:
betas.append("prompt-caching-2024-07-31")
if computer_tool_used:
betas.append("computer-use-2024-10-22")
+ if pdf_used:
+ betas.append("pdfs-2024-09-25")
headers = {
"anthropic-version": anthropic_version or "2023-06-01",
"x-api-key": api_key,
@@ -365,6 +368,21 @@ class AnthropicConfig:
return True
return False
+ def is_pdf_used(self, messages: List[AllMessageValues]) -> bool:
+ """
+        Set to true if media (e.g. a base64-encoded PDF) is passed in the messages.
+ """
+ for message in messages:
+ if (
+ "content" in message
+ and message["content"] is not None
+ and isinstance(message["content"], list)
+ ):
+ for content in message["content"]:
+ if "type" in content:
+ return True
+ return False
+
def translate_system_message(
self, messages: List[AllMessageValues]
) -> List[AnthropicSystemMessageContent]:
diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py
index aee304760..80ad2ca35 100644
--- a/litellm/llms/prompt_templates/factory.py
+++ b/litellm/llms/prompt_templates/factory.py
@@ -1330,7 +1330,10 @@ def convert_to_anthropic_tool_invoke(
def add_cache_control_to_content(
anthropic_content_element: Union[
- dict, AnthropicMessagesImageParam, AnthropicMessagesTextParam
+ dict,
+ AnthropicMessagesImageParam,
+ AnthropicMessagesTextParam,
+ AnthropicMessagesDocumentParam,
],
orignal_content_element: Union[dict, AllMessageValues],
):
@@ -1343,6 +1346,32 @@ def add_cache_control_to_content(
return anthropic_content_element
+def _anthropic_content_element_factory(
+ image_chunk: GenericImageParsingChunk,
+) -> Union[AnthropicMessagesImageParam, AnthropicMessagesDocumentParam]:
+ if image_chunk["media_type"] == "application/pdf":
+ _anthropic_content_element: Union[
+ AnthropicMessagesDocumentParam, AnthropicMessagesImageParam
+ ] = AnthropicMessagesDocumentParam(
+ type="document",
+ source=AnthropicContentParamSource(
+ type="base64",
+ media_type=image_chunk["media_type"],
+ data=image_chunk["data"],
+ ),
+ )
+ else:
+ _anthropic_content_element = AnthropicMessagesImageParam(
+ type="image",
+ source=AnthropicContentParamSource(
+ type="base64",
+ media_type=image_chunk["media_type"],
+ data=image_chunk["data"],
+ ),
+ )
+ return _anthropic_content_element
+
+
def anthropic_messages_pt( # noqa: PLR0915
messages: List[AllMessageValues],
model: str,
@@ -1400,15 +1429,9 @@ def anthropic_messages_pt( # noqa: PLR0915
openai_image_url=m["image_url"]["url"]
)
- _anthropic_content_element = AnthropicMessagesImageParam(
- type="image",
- source=AnthropicImageParamSource(
- type="base64",
- media_type=image_chunk["media_type"],
- data=image_chunk["data"],
- ),
+ _anthropic_content_element = (
+ _anthropic_content_element_factory(image_chunk)
)
-
_content_element = add_cache_control_to_content(
anthropic_content_element=_anthropic_content_element,
orignal_content_element=dict(m),
diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json
index e8aeac2cb..48b25523e 100644
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@@ -1898,7 +1898,8 @@
"supports_function_calling": true,
"tool_use_system_prompt_tokens": 264,
"supports_assistant_prefill": true,
- "supports_prompt_caching": true
+ "supports_prompt_caching": true,
+ "supports_pdf_input": true
},
"claude-3-opus-20240229": {
"max_tokens": 4096,
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index c44a46a67..cd723275b 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -1,63 +1,7 @@
model_list:
- - model_name: claude-3-5-sonnet-20240620
+ - model_name: "*"
litellm_params:
- model: claude-3-5-sonnet-20240620
- api_key: os.environ/ANTHROPIC_API_KEY
- - model_name: claude-3-5-sonnet-aihubmix
- litellm_params:
- model: openai/claude-3-5-sonnet-20240620
- input_cost_per_token: 0.000003 # 3$/M
- output_cost_per_token: 0.000015 # 15$/M
- api_base: "https://exampleopenaiendpoint-production.up.railway.app"
- api_key: my-fake-key
- - model_name: fake-openai-endpoint-2
- litellm_params:
- model: openai/my-fake-model
- api_key: my-fake-key
- api_base: https://exampleopenaiendpoint-production.up.railway.app/
- stream_timeout: 0.001
- timeout: 1
- rpm: 1
- - model_name: fake-openai-endpoint
- litellm_params:
- model: openai/my-fake-model
- api_key: my-fake-key
- api_base: https://exampleopenaiendpoint-production.up.railway.app/
- ## bedrock chat completions
- - model_name: "*anthropic.claude*"
- litellm_params:
- model: bedrock/*anthropic.claude*
- aws_access_key_id: os.environ/BEDROCK_AWS_ACCESS_KEY_ID
- aws_secret_access_key: os.environ/BEDROCK_AWS_SECRET_ACCESS_KEY
- aws_region_name: os.environ/AWS_REGION_NAME
- guardrailConfig:
- "guardrailIdentifier": "h4dsqwhp6j66"
- "guardrailVersion": "2"
- "trace": "enabled"
-
-## bedrock embeddings
- - model_name: "*amazon.titan-embed-*"
- litellm_params:
- model: bedrock/amazon.titan-embed-*
- aws_access_key_id: os.environ/BEDROCK_AWS_ACCESS_KEY_ID
- aws_secret_access_key: os.environ/BEDROCK_AWS_SECRET_ACCESS_KEY
- aws_region_name: os.environ/AWS_REGION_NAME
- - model_name: "*cohere.embed-*"
- litellm_params:
- model: bedrock/cohere.embed-*
- aws_access_key_id: os.environ/BEDROCK_AWS_ACCESS_KEY_ID
- aws_secret_access_key: os.environ/BEDROCK_AWS_SECRET_ACCESS_KEY
- aws_region_name: os.environ/AWS_REGION_NAME
-
- - model_name: gpt-4
- litellm_params:
- model: azure/chatgpt-v-2
- api_base: https://openai-gpt-4-test-v-1.openai.azure.com/
- api_version: "2023-05-15"
- api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault
- rpm: 480
- timeout: 300
- stream_timeout: 60
+ model: "*"
litellm_settings:
fallbacks: [{ "claude-3-5-sonnet-20240620": ["claude-3-5-sonnet-aihubmix"] }]
diff --git a/litellm/proxy/auth/user_api_key_auth.py b/litellm/proxy/auth/user_api_key_auth.py
index ff1acc3c9..6032a72af 100644
--- a/litellm/proxy/auth/user_api_key_auth.py
+++ b/litellm/proxy/auth/user_api_key_auth.py
@@ -1236,7 +1236,6 @@ def _return_user_api_key_auth_obj(
start_time: datetime,
user_role: Optional[LitellmUserRoles] = None,
) -> UserAPIKeyAuth:
- traceback.print_stack()
end_time = datetime.now()
user_api_key_service_logger_obj.service_success_hook(
service=ServiceTypes.AUTH,
diff --git a/litellm/types/llms/anthropic.py b/litellm/types/llms/anthropic.py
index bb65a372d..b0a3780b8 100644
--- a/litellm/types/llms/anthropic.py
+++ b/litellm/types/llms/anthropic.py
@@ -74,7 +74,7 @@ class AnthopicMessagesAssistantMessageParam(TypedDict, total=False):
"""
-class AnthropicImageParamSource(TypedDict):
+class AnthropicContentParamSource(TypedDict):
type: Literal["base64"]
media_type: str
data: str
@@ -82,7 +82,13 @@ class AnthropicImageParamSource(TypedDict):
class AnthropicMessagesImageParam(TypedDict, total=False):
type: Required[Literal["image"]]
- source: Required[AnthropicImageParamSource]
+ source: Required[AnthropicContentParamSource]
+ cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
+
+
+class AnthropicMessagesDocumentParam(TypedDict, total=False):
+ type: Required[Literal["document"]]
+ source: Required[AnthropicContentParamSource]
cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
@@ -108,6 +114,7 @@ AnthropicMessagesUserMessageValues = Union[
AnthropicMessagesTextParam,
AnthropicMessagesImageParam,
AnthropicMessagesToolResultParam,
+ AnthropicMessagesDocumentParam,
]
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index c0a9764e8..a2b62f9cc 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -1322,11 +1322,6 @@ class TranscriptionResponse(OpenAIObject):
class GenericImageParsingChunk(TypedDict):
- # {
- # "type": "base64",
- # "media_type": f"image/{image_format}",
- # "data": base64_data,
- # }
type: str
media_type: str
data: str
diff --git a/litellm/utils.py b/litellm/utils.py
index d07d86f7d..b10c94859 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -1835,6 +1835,13 @@ def supports_audio_input(model: str, custom_llm_provider: Optional[str] = None)
)
+def supports_pdf_input(model: str, custom_llm_provider: Optional[str] = None) -> bool:
+ """Check if a given model supports pdf input in a chat completion call"""
+ return _supports_factory(
+ model=model, custom_llm_provider=custom_llm_provider, key="supports_pdf_input"
+ )
+
+
def supports_audio_output(
model: str, custom_llm_provider: Optional[str] = None
) -> bool:
@@ -5420,2121 +5427,6 @@ def register_prompt_template(
return litellm.custom_prompt_dict
-####### DEPRECATED ################
-
-
-def get_all_keys(llm_provider=None):
- try:
- global last_fetched_at_keys
- # if user is using hosted product -> instantiate their env with their hosted api keys - refresh every 5 minutes
- print_verbose(f"Reaches get all keys, llm_provider: {llm_provider}")
- user_email = (
- os.getenv("LITELLM_EMAIL")
- or litellm.email
- or litellm.token
- or os.getenv("LITELLM_TOKEN")
- )
- if user_email:
- time_delta = 0
- if last_fetched_at_keys is not None:
- current_time = time.time()
- time_delta = current_time - last_fetched_at_keys
- if (
- time_delta > 300 or last_fetched_at_keys is None or llm_provider
- ): # if the llm provider is passed in , assume this happening due to an AuthError for that provider
- # make the api call
- last_fetched_at = time.time()
- print_verbose(f"last_fetched_at: {last_fetched_at}")
- response = requests.post(
- url="http://api.litellm.ai/get_all_keys",
- headers={"content-type": "application/json"},
- data=json.dumps({"user_email": user_email}),
- )
- print_verbose(f"get model key response: {response.text}")
- data = response.json()
- # update model list
- for key, value in data[
- "model_keys"
- ].items(): # follows the LITELLM API KEY format - _API_KEY - e.g. HUGGINGFACE_API_KEY
- os.environ[key] = value
- # set model alias map
- for model_alias, value in data["model_alias_map"].items():
- litellm.model_alias_map[model_alias] = value
- return "it worked!"
- return None
- return None
- except Exception:
- print_verbose(
- f"[Non-Blocking Error] get_all_keys error - {traceback.format_exc()}"
- )
- pass
-
-
-def get_model_list():
- global last_fetched_at, print_verbose
- try:
- # if user is using hosted product -> get their updated model list
- user_email = (
- os.getenv("LITELLM_EMAIL")
- or litellm.email
- or litellm.token
- or os.getenv("LITELLM_TOKEN")
- )
- if user_email:
- # make the api call
- last_fetched_at = time.time()
- print_verbose(f"last_fetched_at: {last_fetched_at}")
- response = requests.post(
- url="http://api.litellm.ai/get_model_list",
- headers={"content-type": "application/json"},
- data=json.dumps({"user_email": user_email}),
- )
- print_verbose(f"get_model_list response: {response.text}")
- data = response.json()
- # update model list
- model_list = data["model_list"]
- # # check if all model providers are in environment
- # model_providers = data["model_providers"]
- # missing_llm_provider = None
- # for item in model_providers:
- # if f"{item.upper()}_API_KEY" not in os.environ:
- # missing_llm_provider = item
- # break
- # # update environment - if required
- # threading.Thread(target=get_all_keys, args=(missing_llm_provider)).start()
- return model_list
- return [] # return empty list by default
- except Exception:
- print_verbose(
- f"[Non-Blocking Error] get_model_list error - {traceback.format_exc()}"
- )
-
-
-######## Streaming Class ############################
-# wraps the completion stream to return the correct format for the model
-# replicate/anthropic/cohere
-
-# class CustomStreamWrapper:
-# def __init__(
-# self,
-# completion_stream,
-# model,
-# logging_obj: Any,
-# custom_llm_provider: Optional[str] = None,
-# stream_options=None,
-# make_call: Optional[Callable] = None,
-# _response_headers: Optional[dict] = None,
-# ):
-# self.model = model
-# self.make_call = make_call
-# self.custom_llm_provider = custom_llm_provider
-# self.logging_obj: LiteLLMLoggingObject = logging_obj
-# self.completion_stream = completion_stream
-# self.sent_first_chunk = False
-# self.sent_last_chunk = False
-# self.system_fingerprint: Optional[str] = None
-# self.received_finish_reason: Optional[str] = None
-# self.special_tokens = [
-# "<|assistant|>",
-# "<|system|>",
-# "<|user|>",
-# "",
-# "",
-# "<|im_end|>",
-# "<|im_start|>",
-# ]
-# self.holding_chunk = ""
-# self.complete_response = ""
-# self.response_uptil_now = ""
-# _model_info = (
-# self.logging_obj.model_call_details.get("litellm_params", {}).get(
-# "model_info", {}
-# )
-# or {}
-# )
-# self._hidden_params = {
-# "model_id": (_model_info.get("id", None)),
-# } # returned as x-litellm-model-id response header in proxy
-
-# self._hidden_params["additional_headers"] = process_response_headers(
-# _response_headers or {}
-# ) # GUARANTEE OPENAI HEADERS IN RESPONSE
-
-# self._response_headers = _response_headers
-# self.response_id = None
-# self.logging_loop = None
-# self.rules = Rules()
-# self.stream_options = stream_options or getattr(
-# logging_obj, "stream_options", None
-# )
-# self.messages = getattr(logging_obj, "messages", None)
-# self.sent_stream_usage = False
-# self.send_stream_usage = (
-# True if self.check_send_stream_usage(self.stream_options) else False
-# )
-# self.tool_call = False
-# self.chunks: List = (
-# []
-# ) # keep track of the returned chunks - used for calculating the input/output tokens for stream options
-# self.is_function_call = self.check_is_function_call(logging_obj=logging_obj)
-
-# def __iter__(self):
-# return self
-
-# def __aiter__(self):
-# return self
-
-# def check_send_stream_usage(self, stream_options: Optional[dict]):
-# return (
-# stream_options is not None
-# and stream_options.get("include_usage", False) is True
-# )
-
-# def check_is_function_call(self, logging_obj) -> bool:
-# if hasattr(logging_obj, "optional_params") and isinstance(
-# logging_obj.optional_params, dict
-# ):
-# if (
-# "litellm_param_is_function_call" in logging_obj.optional_params
-# and logging_obj.optional_params["litellm_param_is_function_call"]
-# is True
-# ):
-# return True
-
-# return False
-
-# def process_chunk(self, chunk: str):
-# """
-# NLP Cloud streaming returns the entire response, for each chunk. Process this, to only return the delta.
-# """
-# try:
-# chunk = chunk.strip()
-# self.complete_response = self.complete_response.strip()
-
-# if chunk.startswith(self.complete_response):
-# # Remove last_sent_chunk only if it appears at the start of the new chunk
-# chunk = chunk[len(self.complete_response) :]
-
-# self.complete_response += chunk
-# return chunk
-# except Exception as e:
-# raise e
-
-# def safety_checker(self) -> None:
-# """
-# Fixes - https://github.com/BerriAI/litellm/issues/5158
-
-# if the model enters a loop and starts repeating the same chunk again, break out of loop and raise an internalservererror - allows for retries.
-
-# Raises - InternalServerError, if LLM enters infinite loop while streaming
-# """
-# if len(self.chunks) >= litellm.REPEATED_STREAMING_CHUNK_LIMIT:
-# # Get the last n chunks
-# last_chunks = self.chunks[-litellm.REPEATED_STREAMING_CHUNK_LIMIT :]
-
-# # Extract the relevant content from the chunks
-# last_contents = [chunk.choices[0].delta.content for chunk in last_chunks]
-
-# # Check if all extracted contents are identical
-# if all(content == last_contents[0] for content in last_contents):
-# if (
-# last_contents[0] is not None
-# and isinstance(last_contents[0], str)
-# and len(last_contents[0]) > 2
-# ): # ignore empty content - https://github.com/BerriAI/litellm/issues/5158#issuecomment-2287156946
-# # All last n chunks are identical
-# raise litellm.InternalServerError(
-# message="The model is repeating the same chunk = {}.".format(
-# last_contents[0]
-# ),
-# model="",
-# llm_provider="",
-# )
-
-# def check_special_tokens(self, chunk: str, finish_reason: Optional[str]):
-# """
-# Output parse / special tokens for sagemaker + hf streaming.
-# """
-# hold = False
-# if (
-# self.custom_llm_provider != "huggingface"
-# and self.custom_llm_provider != "sagemaker"
-# ):
-# return hold, chunk
-
-# if finish_reason:
-# for token in self.special_tokens:
-# if token in chunk:
-# chunk = chunk.replace(token, "")
-# return hold, chunk
-
-# if self.sent_first_chunk is True:
-# return hold, chunk
-
-# curr_chunk = self.holding_chunk + chunk
-# curr_chunk = curr_chunk.strip()
-
-# for token in self.special_tokens:
-# if len(curr_chunk) < len(token) and curr_chunk in token:
-# hold = True
-# self.holding_chunk = curr_chunk
-# elif len(curr_chunk) >= len(token):
-# if token in curr_chunk:
-# self.holding_chunk = curr_chunk.replace(token, "")
-# hold = True
-# else:
-# pass
-
-# if hold is False: # reset
-# self.holding_chunk = ""
-# return hold, curr_chunk
-
-# def handle_anthropic_text_chunk(self, chunk):
-# """
-# For old anthropic models - claude-1, claude-2.
-
-# Claude-3 is handled from within Anthropic.py VIA ModelResponseIterator()
-# """
-# str_line = chunk
-# if isinstance(chunk, bytes): # Handle binary data
-# str_line = chunk.decode("utf-8") # Convert bytes to string
-# text = ""
-# is_finished = False
-# finish_reason = None
-# if str_line.startswith("data:"):
-# data_json = json.loads(str_line[5:])
-# type_chunk = data_json.get("type", None)
-# if type_chunk == "completion":
-# text = data_json.get("completion")
-# finish_reason = data_json.get("stop_reason")
-# if finish_reason is not None:
-# is_finished = True
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# elif "error" in str_line:
-# raise ValueError(f"Unable to parse response. Original response: {str_line}")
-# else:
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-
-# def handle_vertexai_anthropic_chunk(self, chunk):
-# """
-# - MessageStartEvent(message=Message(id='msg_01LeRRgvX4gwkX3ryBVgtuYZ', content=[], model='claude-3-sonnet-20240229', role='assistant', stop_reason=None, stop_sequence=None, type='message', usage=Usage(input_tokens=8, output_tokens=1)), type='message_start'); custom_llm_provider: vertex_ai
-# - ContentBlockStartEvent(content_block=ContentBlock(text='', type='text'), index=0, type='content_block_start'); custom_llm_provider: vertex_ai
-# - ContentBlockDeltaEvent(delta=TextDelta(text='Hello', type='text_delta'), index=0, type='content_block_delta'); custom_llm_provider: vertex_ai
-# """
-# text = ""
-# prompt_tokens = None
-# completion_tokens = None
-# is_finished = False
-# finish_reason = None
-# type_chunk = getattr(chunk, "type", None)
-# if type_chunk == "message_start":
-# message = getattr(chunk, "message", None)
-# text = "" # lets us return a chunk with usage to user
-# _usage = getattr(message, "usage", None)
-# if _usage is not None:
-# prompt_tokens = getattr(_usage, "input_tokens", None)
-# completion_tokens = getattr(_usage, "output_tokens", None)
-# elif type_chunk == "content_block_delta":
-# """
-# Anthropic content chunk
-# chunk = {'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': 'Hello'}}
-# """
-# delta = getattr(chunk, "delta", None)
-# if delta is not None:
-# text = getattr(delta, "text", "")
-# else:
-# text = ""
-# elif type_chunk == "message_delta":
-# """
-# Anthropic
-# chunk = {'type': 'message_delta', 'delta': {'stop_reason': 'max_tokens', 'stop_sequence': None}, 'usage': {'output_tokens': 10}}
-# """
-# # TODO - get usage from this chunk, set in response
-# delta = getattr(chunk, "delta", None)
-# if delta is not None:
-# finish_reason = getattr(delta, "stop_reason", "stop")
-# is_finished = True
-# _usage = getattr(chunk, "usage", None)
-# if _usage is not None:
-# prompt_tokens = getattr(_usage, "input_tokens", None)
-# completion_tokens = getattr(_usage, "output_tokens", None)
-
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# "prompt_tokens": prompt_tokens,
-# "completion_tokens": completion_tokens,
-# }
-
-# def handle_predibase_chunk(self, chunk):
-# try:
-# if not isinstance(chunk, str):
-# chunk = chunk.decode(
-# "utf-8"
-# ) # DO NOT REMOVE this: This is required for HF inference API + Streaming
-# text = ""
-# is_finished = False
-# finish_reason = ""
-# print_verbose(f"chunk: {chunk}")
-# if chunk.startswith("data:"):
-# data_json = json.loads(chunk[5:])
-# print_verbose(f"data json: {data_json}")
-# if "token" in data_json and "text" in data_json["token"]:
-# text = data_json["token"]["text"]
-# if data_json.get("details", False) and data_json["details"].get(
-# "finish_reason", False
-# ):
-# is_finished = True
-# finish_reason = data_json["details"]["finish_reason"]
-# elif data_json.get(
-# "generated_text", False
-# ): # if full generated text exists, then stream is complete
-# text = "" # don't return the final bos token
-# is_finished = True
-# finish_reason = "stop"
-# elif data_json.get("error", False):
-# raise Exception(data_json.get("error"))
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# elif "error" in chunk:
-# raise ValueError(chunk)
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# except Exception as e:
-# raise e
-
-# def handle_huggingface_chunk(self, chunk):
-# try:
-# if not isinstance(chunk, str):
-# chunk = chunk.decode(
-# "utf-8"
-# ) # DO NOT REMOVE this: This is required for HF inference API + Streaming
-# text = ""
-# is_finished = False
-# finish_reason = ""
-# print_verbose(f"chunk: {chunk}")
-# if chunk.startswith("data:"):
-# data_json = json.loads(chunk[5:])
-# print_verbose(f"data json: {data_json}")
-# if "token" in data_json and "text" in data_json["token"]:
-# text = data_json["token"]["text"]
-# if data_json.get("details", False) and data_json["details"].get(
-# "finish_reason", False
-# ):
-# is_finished = True
-# finish_reason = data_json["details"]["finish_reason"]
-# elif data_json.get(
-# "generated_text", False
-# ): # if full generated text exists, then stream is complete
-# text = "" # don't return the final bos token
-# is_finished = True
-# finish_reason = "stop"
-# elif data_json.get("error", False):
-# raise Exception(data_json.get("error"))
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# elif "error" in chunk:
-# raise ValueError(chunk)
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# except Exception as e:
-# raise e
-
-# def handle_ai21_chunk(self, chunk): # fake streaming
-# chunk = chunk.decode("utf-8")
-# data_json = json.loads(chunk)
-# try:
-# text = data_json["completions"][0]["data"]["text"]
-# is_finished = True
-# finish_reason = "stop"
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# except Exception:
-# raise ValueError(f"Unable to parse response. Original response: {chunk}")
-
-# def handle_maritalk_chunk(self, chunk): # fake streaming
-# chunk = chunk.decode("utf-8")
-# data_json = json.loads(chunk)
-# try:
-# text = data_json["answer"]
-# is_finished = True
-# finish_reason = "stop"
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# except Exception:
-# raise ValueError(f"Unable to parse response. Original response: {chunk}")
-
-# def handle_nlp_cloud_chunk(self, chunk):
-# text = ""
-# is_finished = False
-# finish_reason = ""
-# try:
-# if "dolphin" in self.model:
-# chunk = self.process_chunk(chunk=chunk)
-# else:
-# data_json = json.loads(chunk)
-# chunk = data_json["generated_text"]
-# text = chunk
-# if "[DONE]" in text:
-# text = text.replace("[DONE]", "")
-# is_finished = True
-# finish_reason = "stop"
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# except Exception:
-# raise ValueError(f"Unable to parse response. Original response: {chunk}")
-
-# def handle_aleph_alpha_chunk(self, chunk):
-# chunk = chunk.decode("utf-8")
-# data_json = json.loads(chunk)
-# try:
-# text = data_json["completions"][0]["completion"]
-# is_finished = True
-# finish_reason = "stop"
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# except Exception:
-# raise ValueError(f"Unable to parse response. Original response: {chunk}")
-
-# def handle_cohere_chunk(self, chunk):
-# chunk = chunk.decode("utf-8")
-# data_json = json.loads(chunk)
-# try:
-# text = ""
-# is_finished = False
-# finish_reason = ""
-# index: Optional[int] = None
-# if "index" in data_json:
-# index = data_json.get("index")
-# if "text" in data_json:
-# text = data_json["text"]
-# elif "is_finished" in data_json:
-# is_finished = data_json["is_finished"]
-# finish_reason = data_json["finish_reason"]
-# else:
-# raise Exception(data_json)
-# return {
-# "index": index,
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# except Exception:
-# raise ValueError(f"Unable to parse response. Original response: {chunk}")
-
-# def handle_cohere_chat_chunk(self, chunk):
-# chunk = chunk.decode("utf-8")
-# data_json = json.loads(chunk)
-# print_verbose(f"chunk: {chunk}")
-# try:
-# text = ""
-# is_finished = False
-# finish_reason = ""
-# if "text" in data_json:
-# text = data_json["text"]
-# elif "is_finished" in data_json and data_json["is_finished"] is True:
-# is_finished = data_json["is_finished"]
-# finish_reason = data_json["finish_reason"]
-# else:
-# return
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# except Exception:
-# raise ValueError(f"Unable to parse response. Original response: {chunk}")
-
-# def handle_azure_chunk(self, chunk):
-# is_finished = False
-# finish_reason = ""
-# text = ""
-# print_verbose(f"chunk: {chunk}")
-# if "data: [DONE]" in chunk:
-# text = ""
-# is_finished = True
-# finish_reason = "stop"
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# elif chunk.startswith("data:"):
-# data_json = json.loads(chunk[5:]) # chunk.startswith("data:"):
-# try:
-# if len(data_json["choices"]) > 0:
-# delta = data_json["choices"][0]["delta"]
-# text = "" if delta is None else delta.get("content", "")
-# if data_json["choices"][0].get("finish_reason", None):
-# is_finished = True
-# finish_reason = data_json["choices"][0]["finish_reason"]
-# print_verbose(
-# f"text: {text}; is_finished: {is_finished}; finish_reason: {finish_reason}"
-# )
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# except Exception:
-# raise ValueError(
-# f"Unable to parse response. Original response: {chunk}"
-# )
-# elif "error" in chunk:
-# raise ValueError(f"Unable to parse response. Original response: {chunk}")
-# else:
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-
-# def handle_replicate_chunk(self, chunk):
-# try:
-# text = ""
-# is_finished = False
-# finish_reason = ""
-# if "output" in chunk:
-# text = chunk["output"]
-# if "status" in chunk:
-# if chunk["status"] == "succeeded":
-# is_finished = True
-# finish_reason = "stop"
-# elif chunk.get("error", None):
-# raise Exception(chunk["error"])
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# except Exception:
-# raise ValueError(f"Unable to parse response. Original response: {chunk}")
-
-# def handle_openai_chat_completion_chunk(self, chunk):
-# try:
-# print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
-# str_line = chunk
-# text = ""
-# is_finished = False
-# finish_reason = None
-# logprobs = None
-# usage = None
-# if str_line and str_line.choices and len(str_line.choices) > 0:
-# if (
-# str_line.choices[0].delta is not None
-# and str_line.choices[0].delta.content is not None
-# ):
-# text = str_line.choices[0].delta.content
-# else: # function/tool calling chunk - when content is None. in this case we just return the original chunk from openai
-# pass
-# if str_line.choices[0].finish_reason:
-# is_finished = True
-# finish_reason = str_line.choices[0].finish_reason
-
-# # checking for logprobs
-# if (
-# hasattr(str_line.choices[0], "logprobs")
-# and str_line.choices[0].logprobs is not None
-# ):
-# logprobs = str_line.choices[0].logprobs
-# else:
-# logprobs = None
-
-# usage = getattr(str_line, "usage", None)
-
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# "logprobs": logprobs,
-# "original_chunk": str_line,
-# "usage": usage,
-# }
-# except Exception as e:
-# raise e
-
-# def handle_azure_text_completion_chunk(self, chunk):
-# try:
-# print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
-# text = ""
-# is_finished = False
-# finish_reason = None
-# choices = getattr(chunk, "choices", [])
-# if len(choices) > 0:
-# text = choices[0].text
-# if choices[0].finish_reason is not None:
-# is_finished = True
-# finish_reason = choices[0].finish_reason
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-
-# except Exception as e:
-# raise e
-
-# def handle_openai_text_completion_chunk(self, chunk):
-# try:
-# print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
-# text = ""
-# is_finished = False
-# finish_reason = None
-# usage = None
-# choices = getattr(chunk, "choices", [])
-# if len(choices) > 0:
-# text = choices[0].text
-# if choices[0].finish_reason is not None:
-# is_finished = True
-# finish_reason = choices[0].finish_reason
-# usage = getattr(chunk, "usage", None)
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# "usage": usage,
-# }
-
-# except Exception as e:
-# raise e
-
-# def handle_baseten_chunk(self, chunk):
-# try:
-# chunk = chunk.decode("utf-8")
-# if len(chunk) > 0:
-# if chunk.startswith("data:"):
-# data_json = json.loads(chunk[5:])
-# if "token" in data_json and "text" in data_json["token"]:
-# return data_json["token"]["text"]
-# else:
-# return ""
-# data_json = json.loads(chunk)
-# if "model_output" in data_json:
-# if (
-# isinstance(data_json["model_output"], dict)
-# and "data" in data_json["model_output"]
-# and isinstance(data_json["model_output"]["data"], list)
-# ):
-# return data_json["model_output"]["data"][0]
-# elif isinstance(data_json["model_output"], str):
-# return data_json["model_output"]
-# elif "completion" in data_json and isinstance(
-# data_json["completion"], str
-# ):
-# return data_json["completion"]
-# else:
-# raise ValueError(
-# f"Unable to parse response. Original response: {chunk}"
-# )
-# else:
-# return ""
-# else:
-# return ""
-# except Exception as e:
-# verbose_logger.exception(
-# "litellm.CustomStreamWrapper.handle_baseten_chunk(): Exception occured - {}".format(
-# str(e)
-# )
-# )
-# return ""
-
-# def handle_cloudlfare_stream(self, chunk):
-# try:
-# print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
-# chunk = chunk.decode("utf-8")
-# str_line = chunk
-# text = ""
-# is_finished = False
-# finish_reason = None
-
-# if "[DONE]" in chunk:
-# return {"text": text, "is_finished": True, "finish_reason": "stop"}
-# elif str_line.startswith("data:"):
-# data_json = json.loads(str_line[5:])
-# print_verbose(f"delta content: {data_json}")
-# text = data_json["response"]
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# else:
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-
-# except Exception as e:
-# raise e
-
-# def handle_ollama_stream(self, chunk):
-# try:
-# if isinstance(chunk, dict):
-# json_chunk = chunk
-# else:
-# json_chunk = json.loads(chunk)
-# if "error" in json_chunk:
-# raise Exception(f"Ollama Error - {json_chunk}")
-
-# text = ""
-# is_finished = False
-# finish_reason = None
-# if json_chunk["done"] is True:
-# text = ""
-# is_finished = True
-# finish_reason = "stop"
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# elif json_chunk["response"]:
-# print_verbose(f"delta content: {json_chunk}")
-# text = json_chunk["response"]
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# else:
-# raise Exception(f"Ollama Error - {json_chunk}")
-# except Exception as e:
-# raise e
-
-# def handle_ollama_chat_stream(self, chunk):
-# # for ollama_chat/ provider
-# try:
-# if isinstance(chunk, dict):
-# json_chunk = chunk
-# else:
-# json_chunk = json.loads(chunk)
-# if "error" in json_chunk:
-# raise Exception(f"Ollama Error - {json_chunk}")
-
-# text = ""
-# is_finished = False
-# finish_reason = None
-# if json_chunk["done"] is True:
-# text = ""
-# is_finished = True
-# finish_reason = "stop"
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# elif "message" in json_chunk:
-# print_verbose(f"delta content: {json_chunk}")
-# text = json_chunk["message"]["content"]
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# }
-# else:
-# raise Exception(f"Ollama Error - {json_chunk}")
-# except Exception as e:
-# raise e
-
-# def handle_watsonx_stream(self, chunk):
-# try:
-# if isinstance(chunk, dict):
-# parsed_response = chunk
-# elif isinstance(chunk, (str, bytes)):
-# if isinstance(chunk, bytes):
-# chunk = chunk.decode("utf-8")
-# if "generated_text" in chunk:
-# response = chunk.replace("data: ", "").strip()
-# parsed_response = json.loads(response)
-# else:
-# return {
-# "text": "",
-# "is_finished": False,
-# "prompt_tokens": 0,
-# "completion_tokens": 0,
-# }
-# else:
-# print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
-# raise ValueError(
-# f"Unable to parse response. Original response: {chunk}"
-# )
-# results = parsed_response.get("results", [])
-# if len(results) > 0:
-# text = results[0].get("generated_text", "")
-# finish_reason = results[0].get("stop_reason")
-# is_finished = finish_reason != "not_finished"
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# "prompt_tokens": results[0].get("input_token_count", 0),
-# "completion_tokens": results[0].get("generated_token_count", 0),
-# }
-# return {"text": "", "is_finished": False}
-# except Exception as e:
-# raise e
-
-# def handle_triton_stream(self, chunk):
-# try:
-# if isinstance(chunk, dict):
-# parsed_response = chunk
-# elif isinstance(chunk, (str, bytes)):
-# if isinstance(chunk, bytes):
-# chunk = chunk.decode("utf-8")
-# if "text_output" in chunk:
-# response = chunk.replace("data: ", "").strip()
-# parsed_response = json.loads(response)
-# else:
-# return {
-# "text": "",
-# "is_finished": False,
-# "prompt_tokens": 0,
-# "completion_tokens": 0,
-# }
-# else:
-# print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
-# raise ValueError(
-# f"Unable to parse response. Original response: {chunk}"
-# )
-# text = parsed_response.get("text_output", "")
-# finish_reason = parsed_response.get("stop_reason")
-# is_finished = parsed_response.get("is_finished", False)
-# return {
-# "text": text,
-# "is_finished": is_finished,
-# "finish_reason": finish_reason,
-# "prompt_tokens": parsed_response.get("input_token_count", 0),
-# "completion_tokens": parsed_response.get("generated_token_count", 0),
-# }
-# return {"text": "", "is_finished": False}
-# except Exception as e:
-# raise e
-
-# def handle_clarifai_completion_chunk(self, chunk):
-# try:
-# if isinstance(chunk, dict):
-# parsed_response = chunk
-# elif isinstance(chunk, (str, bytes)):
-# if isinstance(chunk, bytes):
-# parsed_response = chunk.decode("utf-8")
-# else:
-# parsed_response = chunk
-# else:
-# raise ValueError("Unable to parse streaming chunk")
-# if isinstance(parsed_response, dict):
-# data_json = parsed_response
-# else:
-# data_json = json.loads(parsed_response)
-# text = (
-# data_json.get("outputs", "")[0]
-# .get("data", "")
-# .get("text", "")
-# .get("raw", "")
-# )
-# len(
-# encoding.encode(
-# data_json.get("outputs", "")[0]
-# .get("input", "")
-# .get("data", "")
-# .get("text", "")
-# .get("raw", "")
-# )
-# )
-# len(encoding.encode(text))
-# return {
-# "text": text,
-# "is_finished": True,
-# }
-# except Exception as e:
-# verbose_logger.exception(
-# "litellm.CustomStreamWrapper.handle_clarifai_chunk(): Exception occured - {}".format(
-# str(e)
-# )
-# )
-# return ""
-
-# def model_response_creator(
-# self, chunk: Optional[dict] = None, hidden_params: Optional[dict] = None
-# ):
-# _model = self.model
-# _received_llm_provider = self.custom_llm_provider
-# _logging_obj_llm_provider = self.logging_obj.model_call_details.get("custom_llm_provider", None) # type: ignore
-# if (
-# _received_llm_provider == "openai"
-# and _received_llm_provider != _logging_obj_llm_provider
-# ):
-# _model = "{}/{}".format(_logging_obj_llm_provider, _model)
-# if chunk is None:
-# chunk = {}
-# else:
-# # pop model keyword
-# chunk.pop("model", None)
-
-# model_response = ModelResponse(
-# stream=True, model=_model, stream_options=self.stream_options, **chunk
-# )
-# if self.response_id is not None:
-# model_response.id = self.response_id
-# else:
-# self.response_id = model_response.id # type: ignore
-# if self.system_fingerprint is not None:
-# model_response.system_fingerprint = self.system_fingerprint
-# if hidden_params is not None:
-# model_response._hidden_params = hidden_params
-# model_response._hidden_params["custom_llm_provider"] = _logging_obj_llm_provider
-# model_response._hidden_params["created_at"] = time.time()
-# model_response._hidden_params = {
-# **model_response._hidden_params,
-# **self._hidden_params,
-# }
-
-# if (
-# len(model_response.choices) > 0
-# and getattr(model_response.choices[0], "delta") is not None
-# ):
-# # do nothing, if object instantiated
-# pass
-# else:
-# model_response.choices = [StreamingChoices(finish_reason=None)]
-# return model_response
-
-# def is_delta_empty(self, delta: Delta) -> bool:
-# is_empty = True
-# if delta.content is not None:
-# is_empty = False
-# elif delta.tool_calls is not None:
-# is_empty = False
-# elif delta.function_call is not None:
-# is_empty = False
-# return is_empty
-
-# def return_processed_chunk_logic( # noqa
-# self,
-# completion_obj: dict,
-# model_response: ModelResponseStream,
-# response_obj: dict,
-# ):
-
-# print_verbose(
-# f"completion_obj: {completion_obj}, model_response.choices[0]: {model_response.choices[0]}, response_obj: {response_obj}"
-# )
-# if (
-# "content" in completion_obj
-# and (
-# isinstance(completion_obj["content"], str)
-# and len(completion_obj["content"]) > 0
-# )
-# or (
-# "tool_calls" in completion_obj
-# and completion_obj["tool_calls"] is not None
-# and len(completion_obj["tool_calls"]) > 0
-# )
-# or (
-# "function_call" in completion_obj
-# and completion_obj["function_call"] is not None
-# )
-# ): # cannot set content of an OpenAI Object to be an empty string
-# self.safety_checker()
-# hold, model_response_str = self.check_special_tokens(
-# chunk=completion_obj["content"],
-# finish_reason=model_response.choices[0].finish_reason,
-# ) # filter out bos/eos tokens from openai-compatible hf endpoints
-# print_verbose(f"hold - {hold}, model_response_str - {model_response_str}")
-# if hold is False:
-# ## check if openai/azure chunk
-# original_chunk = response_obj.get("original_chunk", None)
-# if original_chunk:
-# model_response.id = original_chunk.id
-# self.response_id = original_chunk.id
-# if len(original_chunk.choices) > 0:
-# choices = []
-# for choice in original_chunk.choices:
-# try:
-# if isinstance(choice, BaseModel):
-# choice_json = choice.model_dump()
-# choice_json.pop(
-# "finish_reason", None
-# ) # for mistral etc. which return a value in their last chunk (not-openai compatible).
-# print_verbose(f"choice_json: {choice_json}")
-# choices.append(StreamingChoices(**choice_json))
-# except Exception:
-# choices.append(StreamingChoices())
-# print_verbose(f"choices in streaming: {choices}")
-# setattr(model_response, "choices", choices)
-# else:
-# return
-# model_response.system_fingerprint = (
-# original_chunk.system_fingerprint
-# )
-# setattr(
-# model_response,
-# "citations",
-# getattr(original_chunk, "citations", None),
-# )
-# print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
-# if self.sent_first_chunk is False:
-# model_response.choices[0].delta["role"] = "assistant"
-# self.sent_first_chunk = True
-# elif self.sent_first_chunk is True and hasattr(
-# model_response.choices[0].delta, "role"
-# ):
-# _initial_delta = model_response.choices[0].delta.model_dump()
-# _initial_delta.pop("role", None)
-# model_response.choices[0].delta = Delta(**_initial_delta)
-# print_verbose(
-# f"model_response.choices[0].delta: {model_response.choices[0].delta}"
-# )
-# else:
-# ## else
-# completion_obj["content"] = model_response_str
-# if self.sent_first_chunk is False:
-# completion_obj["role"] = "assistant"
-# self.sent_first_chunk = True
-
-# model_response.choices[0].delta = Delta(**completion_obj)
-# _index: Optional[int] = completion_obj.get("index")
-# if _index is not None:
-# model_response.choices[0].index = _index
-# print_verbose(f"returning model_response: {model_response}")
-# return model_response
-# else:
-# return
-# elif self.received_finish_reason is not None:
-# if self.sent_last_chunk is True:
-# # Bedrock returns the guardrail trace in the last chunk - we want to return this here
-# if self.custom_llm_provider == "bedrock" and "trace" in model_response:
-# return model_response
-
-# # Default - return StopIteration
-# raise StopIteration
-# # flush any remaining holding chunk
-# if len(self.holding_chunk) > 0:
-# if model_response.choices[0].delta.content is None:
-# model_response.choices[0].delta.content = self.holding_chunk
-# else:
-# model_response.choices[0].delta.content = (
-# self.holding_chunk + model_response.choices[0].delta.content
-# )
-# self.holding_chunk = ""
-# # if delta is None
-# _is_delta_empty = self.is_delta_empty(delta=model_response.choices[0].delta)
-
-# if _is_delta_empty:
-# # get any function call arguments
-# model_response.choices[0].finish_reason = map_finish_reason(
-# finish_reason=self.received_finish_reason
-# ) # ensure consistent output to openai
-
-# self.sent_last_chunk = True
-
-# return model_response
-# elif (
-# model_response.choices[0].delta.tool_calls is not None
-# or model_response.choices[0].delta.function_call is not None
-# ):
-# if self.sent_first_chunk is False:
-# model_response.choices[0].delta["role"] = "assistant"
-# self.sent_first_chunk = True
-# return model_response
-# elif (
-# len(model_response.choices) > 0
-# and hasattr(model_response.choices[0].delta, "audio")
-# and model_response.choices[0].delta.audio is not None
-# ):
-# return model_response
-# else:
-# if hasattr(model_response, "usage"):
-# self.chunks.append(model_response)
-# return
-
-# def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915
-# model_response = self.model_response_creator()
-# response_obj: dict = {}
-# try:
-# # return this for all models
-# completion_obj = {"content": ""}
-# from litellm.litellm_core_utils.streaming_utils import (
-# generic_chunk_has_all_required_fields,
-# )
-# from litellm.types.utils import GenericStreamingChunk as GChunk
-
-# if (
-# isinstance(chunk, dict)
-# and generic_chunk_has_all_required_fields(
-# chunk=chunk
-# ) # check if chunk is a generic streaming chunk
-# ) or (
-# self.custom_llm_provider
-# and (
-# self.custom_llm_provider == "anthropic"
-# or self.custom_llm_provider in litellm._custom_providers
-# )
-# ):
-
-# if self.received_finish_reason is not None:
-# if "provider_specific_fields" not in chunk:
-# raise StopIteration
-# anthropic_response_obj: GChunk = chunk
-# completion_obj["content"] = anthropic_response_obj["text"]
-# if anthropic_response_obj["is_finished"]:
-# self.received_finish_reason = anthropic_response_obj[
-# "finish_reason"
-# ]
-
-# if anthropic_response_obj["usage"] is not None:
-# model_response.usage = litellm.Usage(
-# **anthropic_response_obj["usage"]
-# )
-
-# if (
-# "tool_use" in anthropic_response_obj
-# and anthropic_response_obj["tool_use"] is not None
-# ):
-# completion_obj["tool_calls"] = [anthropic_response_obj["tool_use"]]
-
-# if (
-# "provider_specific_fields" in anthropic_response_obj
-# and anthropic_response_obj["provider_specific_fields"] is not None
-# ):
-# for key, value in anthropic_response_obj[
-# "provider_specific_fields"
-# ].items():
-# setattr(model_response, key, value)
-
-# response_obj = anthropic_response_obj
-# elif (
-# self.custom_llm_provider
-# and self.custom_llm_provider == "anthropic_text"
-# ):
-# response_obj = self.handle_anthropic_text_chunk(chunk)
-# completion_obj["content"] = response_obj["text"]
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif self.custom_llm_provider and self.custom_llm_provider == "clarifai":
-# response_obj = self.handle_clarifai_completion_chunk(chunk)
-# completion_obj["content"] = response_obj["text"]
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif self.model == "replicate" or self.custom_llm_provider == "replicate":
-# response_obj = self.handle_replicate_chunk(chunk)
-# completion_obj["content"] = response_obj["text"]
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif self.custom_llm_provider and self.custom_llm_provider == "huggingface":
-# response_obj = self.handle_huggingface_chunk(chunk)
-# completion_obj["content"] = response_obj["text"]
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif self.custom_llm_provider and self.custom_llm_provider == "predibase":
-# response_obj = self.handle_predibase_chunk(chunk)
-# completion_obj["content"] = response_obj["text"]
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif (
-# self.custom_llm_provider and self.custom_llm_provider == "baseten"
-# ): # baseten doesn't provide streaming
-# completion_obj["content"] = self.handle_baseten_chunk(chunk)
-# elif (
-# self.custom_llm_provider and self.custom_llm_provider == "ai21"
-# ): # ai21 doesn't provide streaming
-# response_obj = self.handle_ai21_chunk(chunk)
-# completion_obj["content"] = response_obj["text"]
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif self.custom_llm_provider and self.custom_llm_provider == "maritalk":
-# response_obj = self.handle_maritalk_chunk(chunk)
-# completion_obj["content"] = response_obj["text"]
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif self.custom_llm_provider and self.custom_llm_provider == "vllm":
-# completion_obj["content"] = chunk[0].outputs[0].text
-# elif (
-# self.custom_llm_provider and self.custom_llm_provider == "aleph_alpha"
-# ): # aleph alpha doesn't provide streaming
-# response_obj = self.handle_aleph_alpha_chunk(chunk)
-# completion_obj["content"] = response_obj["text"]
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif self.custom_llm_provider == "nlp_cloud":
-# try:
-# response_obj = self.handle_nlp_cloud_chunk(chunk)
-# completion_obj["content"] = response_obj["text"]
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# except Exception as e:
-# if self.received_finish_reason:
-# raise e
-# else:
-# if self.sent_first_chunk is False:
-# raise Exception("An unknown error occurred with the stream")
-# self.received_finish_reason = "stop"
-# elif self.custom_llm_provider == "vertex_ai":
-# import proto # type: ignore
-
-# if self.model.startswith("claude-3"):
-# response_obj = self.handle_vertexai_anthropic_chunk(chunk=chunk)
-# if response_obj is None:
-# return
-# completion_obj["content"] = response_obj["text"]
-# setattr(model_response, "usage", Usage())
-# if response_obj.get("prompt_tokens", None) is not None:
-# model_response.usage.prompt_tokens = response_obj[
-# "prompt_tokens"
-# ]
-# if response_obj.get("completion_tokens", None) is not None:
-# model_response.usage.completion_tokens = response_obj[
-# "completion_tokens"
-# ]
-# if hasattr(model_response.usage, "prompt_tokens"):
-# model_response.usage.total_tokens = (
-# getattr(model_response.usage, "total_tokens", 0)
-# + model_response.usage.prompt_tokens
-# )
-# if hasattr(model_response.usage, "completion_tokens"):
-# model_response.usage.total_tokens = (
-# getattr(model_response.usage, "total_tokens", 0)
-# + model_response.usage.completion_tokens
-# )
-
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif hasattr(chunk, "candidates") is True:
-# try:
-# try:
-# completion_obj["content"] = chunk.text
-# except Exception as e:
-# if "Part has no text." in str(e):
-# ## check for function calling
-# function_call = (
-# chunk.candidates[0].content.parts[0].function_call
-# )
-
-# args_dict = {}
-
-# # Check if it's a RepeatedComposite instance
-# for key, val in function_call.args.items():
-# if isinstance(
-# val,
-# proto.marshal.collections.repeated.RepeatedComposite,
-# ):
-# # If so, convert to list
-# args_dict[key] = [v for v in val]
-# else:
-# args_dict[key] = val
-
-# try:
-# args_str = json.dumps(args_dict)
-# except Exception as e:
-# raise e
-# _delta_obj = litellm.utils.Delta(
-# content=None,
-# tool_calls=[
-# {
-# "id": f"call_{str(uuid.uuid4())}",
-# "function": {
-# "arguments": args_str,
-# "name": function_call.name,
-# },
-# "type": "function",
-# }
-# ],
-# )
-# _streaming_response = StreamingChoices(delta=_delta_obj)
-# _model_response = ModelResponse(stream=True)
-# _model_response.choices = [_streaming_response]
-# response_obj = {"original_chunk": _model_response}
-# else:
-# raise e
-# if (
-# hasattr(chunk.candidates[0], "finish_reason")
-# and chunk.candidates[0].finish_reason.name
-# != "FINISH_REASON_UNSPECIFIED"
-# ): # every non-final chunk in vertex ai has this
-# self.received_finish_reason = chunk.candidates[
-# 0
-# ].finish_reason.name
-# except Exception:
-# if chunk.candidates[0].finish_reason.name == "SAFETY":
-# raise Exception(
-# f"The response was blocked by VertexAI. {str(chunk)}"
-# )
-# else:
-# completion_obj["content"] = str(chunk)
-# elif self.custom_llm_provider == "cohere":
-# response_obj = self.handle_cohere_chunk(chunk)
-# completion_obj["content"] = response_obj["text"]
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif self.custom_llm_provider == "cohere_chat":
-# response_obj = self.handle_cohere_chat_chunk(chunk)
-# if response_obj is None:
-# return
-# completion_obj["content"] = response_obj["text"]
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-
-# elif self.custom_llm_provider == "petals":
-# if len(self.completion_stream) == 0:
-# if self.received_finish_reason is not None:
-# raise StopIteration
-# else:
-# self.received_finish_reason = "stop"
-# chunk_size = 30
-# new_chunk = self.completion_stream[:chunk_size]
-# completion_obj["content"] = new_chunk
-# self.completion_stream = self.completion_stream[chunk_size:]
-# elif self.custom_llm_provider == "palm":
-# # fake streaming
-# response_obj = {}
-# if len(self.completion_stream) == 0:
-# if self.received_finish_reason is not None:
-# raise StopIteration
-# else:
-# self.received_finish_reason = "stop"
-# chunk_size = 30
-# new_chunk = self.completion_stream[:chunk_size]
-# completion_obj["content"] = new_chunk
-# self.completion_stream = self.completion_stream[chunk_size:]
-# elif self.custom_llm_provider == "ollama":
-# response_obj = self.handle_ollama_stream(chunk)
-# completion_obj["content"] = response_obj["text"]
-# print_verbose(f"completion obj content: {completion_obj['content']}")
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif self.custom_llm_provider == "ollama_chat":
-# response_obj = self.handle_ollama_chat_stream(chunk)
-# completion_obj["content"] = response_obj["text"]
-# print_verbose(f"completion obj content: {completion_obj['content']}")
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif self.custom_llm_provider == "cloudflare":
-# response_obj = self.handle_cloudlfare_stream(chunk)
-# completion_obj["content"] = response_obj["text"]
-# print_verbose(f"completion obj content: {completion_obj['content']}")
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif self.custom_llm_provider == "watsonx":
-# response_obj = self.handle_watsonx_stream(chunk)
-# completion_obj["content"] = response_obj["text"]
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif self.custom_llm_provider == "triton":
-# response_obj = self.handle_triton_stream(chunk)
-# completion_obj["content"] = response_obj["text"]
-# print_verbose(f"completion obj content: {completion_obj['content']}")
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif self.custom_llm_provider == "text-completion-openai":
-# response_obj = self.handle_openai_text_completion_chunk(chunk)
-# completion_obj["content"] = response_obj["text"]
-# print_verbose(f"completion obj content: {completion_obj['content']}")
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# if response_obj["usage"] is not None:
-# model_response.usage = litellm.Usage(
-# prompt_tokens=response_obj["usage"].prompt_tokens,
-# completion_tokens=response_obj["usage"].completion_tokens,
-# total_tokens=response_obj["usage"].total_tokens,
-# )
-# elif self.custom_llm_provider == "text-completion-codestral":
-# response_obj = litellm.MistralTextCompletionConfig()._chunk_parser(
-# chunk
-# )
-# completion_obj["content"] = response_obj["text"]
-# print_verbose(f"completion obj content: {completion_obj['content']}")
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# if "usage" in response_obj is not None:
-# model_response.usage = litellm.Usage(
-# prompt_tokens=response_obj["usage"].prompt_tokens,
-# completion_tokens=response_obj["usage"].completion_tokens,
-# total_tokens=response_obj["usage"].total_tokens,
-# )
-# elif self.custom_llm_provider == "azure_text":
-# response_obj = self.handle_azure_text_completion_chunk(chunk)
-# completion_obj["content"] = response_obj["text"]
-# print_verbose(f"completion obj content: {completion_obj['content']}")
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# elif self.custom_llm_provider == "cached_response":
-# response_obj = {
-# "text": chunk.choices[0].delta.content,
-# "is_finished": True,
-# "finish_reason": chunk.choices[0].finish_reason,
-# "original_chunk": chunk,
-# "tool_calls": (
-# chunk.choices[0].delta.tool_calls
-# if hasattr(chunk.choices[0].delta, "tool_calls")
-# else None
-# ),
-# }
-
-# completion_obj["content"] = response_obj["text"]
-# if response_obj["tool_calls"] is not None:
-# completion_obj["tool_calls"] = response_obj["tool_calls"]
-# print_verbose(f"completion obj content: {completion_obj['content']}")
-# if hasattr(chunk, "id"):
-# model_response.id = chunk.id
-# self.response_id = chunk.id
-# if hasattr(chunk, "system_fingerprint"):
-# self.system_fingerprint = chunk.system_fingerprint
-# if response_obj["is_finished"]:
-# self.received_finish_reason = response_obj["finish_reason"]
-# else: # openai / azure chat model
-# if self.custom_llm_provider == "azure":
-# if hasattr(chunk, "model"):
-# # for azure, we need to pass the model from the orignal chunk
-# self.model = chunk.model
-# response_obj = self.handle_openai_chat_completion_chunk(chunk)
-# if response_obj is None:
-# return
-# completion_obj["content"] = response_obj["text"]
-# print_verbose(f"completion obj content: {completion_obj['content']}")
-# if response_obj["is_finished"]:
-# if response_obj["finish_reason"] == "error":
-# raise Exception(
-# "{} raised a streaming error - finish_reason: error, no content string given. Received Chunk={}".format(
-# self.custom_llm_provider, response_obj
-# )
-# )
-# self.received_finish_reason = response_obj["finish_reason"]
-# if response_obj.get("original_chunk", None) is not None:
-# if hasattr(response_obj["original_chunk"], "id"):
-# model_response.id = response_obj["original_chunk"].id
-# self.response_id = model_response.id
-# if hasattr(response_obj["original_chunk"], "system_fingerprint"):
-# model_response.system_fingerprint = response_obj[
-# "original_chunk"
-# ].system_fingerprint
-# self.system_fingerprint = response_obj[
-# "original_chunk"
-# ].system_fingerprint
-# if response_obj["logprobs"] is not None:
-# model_response.choices[0].logprobs = response_obj["logprobs"]
-
-# if response_obj["usage"] is not None:
-# if isinstance(response_obj["usage"], dict):
-# model_response.usage = litellm.Usage(
-# prompt_tokens=response_obj["usage"].get(
-# "prompt_tokens", None
-# )
-# or None,
-# completion_tokens=response_obj["usage"].get(
-# "completion_tokens", None
-# )
-# or None,
-# total_tokens=response_obj["usage"].get("total_tokens", None)
-# or None,
-# )
-# elif isinstance(response_obj["usage"], BaseModel):
-# model_response.usage = litellm.Usage(
-# **response_obj["usage"].model_dump()
-# )
-
-# model_response.model = self.model
-# print_verbose(
-# f"model_response finish reason 3: {self.received_finish_reason}; response_obj={response_obj}"
-# )
-# ## FUNCTION CALL PARSING
-# if (
-# response_obj is not None
-# and response_obj.get("original_chunk", None) is not None
-# ): # function / tool calling branch - only set for openai/azure compatible endpoints
-# # enter this branch when no content has been passed in response
-# original_chunk = response_obj.get("original_chunk", None)
-# model_response.id = original_chunk.id
-# self.response_id = original_chunk.id
-# if original_chunk.choices and len(original_chunk.choices) > 0:
-# delta = original_chunk.choices[0].delta
-# if delta is not None and (
-# delta.function_call is not None or delta.tool_calls is not None
-# ):
-# try:
-# model_response.system_fingerprint = (
-# original_chunk.system_fingerprint
-# )
-# ## AZURE - check if arguments is not None
-# if (
-# original_chunk.choices[0].delta.function_call
-# is not None
-# ):
-# if (
-# getattr(
-# original_chunk.choices[0].delta.function_call,
-# "arguments",
-# )
-# is None
-# ):
-# original_chunk.choices[
-# 0
-# ].delta.function_call.arguments = ""
-# elif original_chunk.choices[0].delta.tool_calls is not None:
-# if isinstance(
-# original_chunk.choices[0].delta.tool_calls, list
-# ):
-# for t in original_chunk.choices[0].delta.tool_calls:
-# if hasattr(t, "functions") and hasattr(
-# t.functions, "arguments"
-# ):
-# if (
-# getattr(
-# t.function,
-# "arguments",
-# )
-# is None
-# ):
-# t.function.arguments = ""
-# _json_delta = delta.model_dump()
-# print_verbose(f"_json_delta: {_json_delta}")
-# if "role" not in _json_delta or _json_delta["role"] is None:
-# _json_delta["role"] = (
-# "assistant" # mistral's api returns role as None
-# )
-# if "tool_calls" in _json_delta and isinstance(
-# _json_delta["tool_calls"], list
-# ):
-# for tool in _json_delta["tool_calls"]:
-# if (
-# isinstance(tool, dict)
-# and "function" in tool
-# and isinstance(tool["function"], dict)
-# and ("type" not in tool or tool["type"] is None)
-# ):
-# # if function returned but type set to None - mistral's api returns type: None
-# tool["type"] = "function"
-# model_response.choices[0].delta = Delta(**_json_delta)
-# except Exception as e:
-# verbose_logger.exception(
-# "litellm.CustomStreamWrapper.chunk_creator(): Exception occured - {}".format(
-# str(e)
-# )
-# )
-# model_response.choices[0].delta = Delta()
-# elif (
-# delta is not None and getattr(delta, "audio", None) is not None
-# ):
-# model_response.choices[0].delta.audio = delta.audio
-# else:
-# try:
-# delta = (
-# dict()
-# if original_chunk.choices[0].delta is None
-# else dict(original_chunk.choices[0].delta)
-# )
-# print_verbose(f"original delta: {delta}")
-# model_response.choices[0].delta = Delta(**delta)
-# print_verbose(
-# f"new delta: {model_response.choices[0].delta}"
-# )
-# except Exception:
-# model_response.choices[0].delta = Delta()
-# else:
-# if (
-# self.stream_options is not None
-# and self.stream_options["include_usage"] is True
-# ):
-# return model_response
-# return
-# print_verbose(
-# f"model_response.choices[0].delta: {model_response.choices[0].delta}; completion_obj: {completion_obj}"
-# )
-# print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
-
-# ## CHECK FOR TOOL USE
-# if "tool_calls" in completion_obj and len(completion_obj["tool_calls"]) > 0:
-# if self.is_function_call is True: # user passed in 'functions' param
-# completion_obj["function_call"] = completion_obj["tool_calls"][0][
-# "function"
-# ]
-# completion_obj["tool_calls"] = None
-
-# self.tool_call = True
-
-# ## RETURN ARG
-# return self.return_processed_chunk_logic(
-# completion_obj=completion_obj,
-# model_response=model_response, # type: ignore
-# response_obj=response_obj,
-# )
-
-# except StopIteration:
-# raise StopIteration
-# except Exception as e:
-# traceback.format_exc()
-# e.message = str(e)
-# raise exception_type(
-# model=self.model,
-# custom_llm_provider=self.custom_llm_provider,
-# original_exception=e,
-# )
-
-# def set_logging_event_loop(self, loop):
-# """
-# import litellm, asyncio
-
-# loop = asyncio.get_event_loop() # 👈 gets the current event loop
-
-# response = litellm.completion(.., stream=True)
-
-# response.set_logging_event_loop(loop=loop) # 👈 enables async_success callbacks for sync logging
-
-# for chunk in response:
-# ...
-# """
-# self.logging_loop = loop
-
-# def run_success_logging_and_cache_storage(self, processed_chunk, cache_hit: bool):
-# """
-# Runs success logging in a thread and adds the response to the cache
-# """
-# if litellm.disable_streaming_logging is True:
-# """
-# [NOT RECOMMENDED]
-# Set this via `litellm.disable_streaming_logging = True`.
-
-# Disables streaming logging.
-# """
-# return
-# ## ASYNC LOGGING
-# # Create an event loop for the new thread
-# if self.logging_loop is not None:
-# future = asyncio.run_coroutine_threadsafe(
-# self.logging_obj.async_success_handler(
-# processed_chunk, None, None, cache_hit
-# ),
-# loop=self.logging_loop,
-# )
-# future.result()
-# else:
-# asyncio.run(
-# self.logging_obj.async_success_handler(
-# processed_chunk, None, None, cache_hit
-# )
-# )
-# ## SYNC LOGGING
-# self.logging_obj.success_handler(processed_chunk, None, None, cache_hit)
-
-# ## Sync store in cache
-# if self.logging_obj._llm_caching_handler is not None:
-# self.logging_obj._llm_caching_handler._sync_add_streaming_response_to_cache(
-# processed_chunk
-# )
-
-# def finish_reason_handler(self):
-# model_response = self.model_response_creator()
-# complete_streaming_response = litellm.stream_chunk_builder(
-# chunks=self.chunks
-# )
-# _finish_reason = complete_streaming_response.choices[0].finish_reason
-
-# print(f"_finish_reason: {_finish_reason}")
-# if _finish_reason is not None:
-# model_response.choices[0].finish_reason = _finish_reason
-# else:
-# model_response.choices[0].finish_reason = "stop"
-
-# ## if tool use
-# if (
-# model_response.choices[0].finish_reason == "stop" and self.tool_call
-# ): # don't overwrite for other - potential error finish reasons
-# model_response.choices[0].finish_reason = "tool_calls"
-# return model_response
-
-# def __next__(self): # noqa: PLR0915
-# cache_hit = False
-# if (
-# self.custom_llm_provider is not None
-# and self.custom_llm_provider == "cached_response"
-# ):
-# cache_hit = True
-# try:
-# if self.completion_stream is None:
-# self.fetch_sync_stream()
-# while True:
-# if (
-# isinstance(self.completion_stream, str)
-# or isinstance(self.completion_stream, bytes)
-# or isinstance(self.completion_stream, ModelResponse)
-# ):
-# chunk = self.completion_stream
-# else:
-# chunk = next(self.completion_stream)
-# if chunk is not None and chunk != b"":
-# print_verbose(
-# f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}; custom_llm_provider: {self.custom_llm_provider}"
-# )
-# response: Optional[ModelResponse] = self.chunk_creator(chunk=chunk)
-# print_verbose(f"PROCESSED CHUNK POST CHUNK CREATOR: {response}")
-
-# if response is None:
-# continue
-# ## LOGGING
-# threading.Thread(
-# target=self.run_success_logging_and_cache_storage,
-# args=(response, cache_hit),
-# ).start() # log response
-# choice = response.choices[0]
-# if isinstance(choice, StreamingChoices):
-# self.response_uptil_now += choice.delta.get("content", "") or ""
-# else:
-# self.response_uptil_now += ""
-# self.rules.post_call_rules(
-# input=self.response_uptil_now, model=self.model
-# )
-# # HANDLE STREAM OPTIONS
-# self.chunks.append(response)
-# if hasattr(
-# response, "usage"
-# ): # remove usage from chunk, only send on final chunk
-# # Convert the object to a dictionary
-# obj_dict = response.dict()
-
-# # Remove an attribute (e.g., 'attr2')
-# if "usage" in obj_dict:
-# del obj_dict["usage"]
-
-# # Create a new object without the removed attribute
-# response = self.model_response_creator(
-# chunk=obj_dict, hidden_params=response._hidden_params
-# )
-# # add usage as hidden param
-# if self.sent_last_chunk is True and self.stream_options is None:
-# usage = calculate_total_usage(chunks=self.chunks)
-# response._hidden_params["usage"] = usage
-# # RETURN RESULT
-# return response
-
-# except StopIteration:
-# if self.sent_last_chunk is True:
-# complete_streaming_response = litellm.stream_chunk_builder(
-# chunks=self.chunks, messages=self.messages
-# )
-# response = self.model_response_creator()
-# if complete_streaming_response is not None:
-# setattr(
-# response,
-# "usage",
-# getattr(complete_streaming_response, "usage"),
-# )
-
-# ## LOGGING
-# threading.Thread(
-# target=self.logging_obj.success_handler,
-# args=(response, None, None, cache_hit),
-# ).start() # log response
-
-# if self.sent_stream_usage is False and self.send_stream_usage is True:
-# self.sent_stream_usage = True
-# return response
-# raise # Re-raise StopIteration
-# else:
-# self.sent_last_chunk = True
-# processed_chunk = self.finish_reason_handler()
-# if self.stream_options is None: # add usage as hidden param
-# usage = calculate_total_usage(chunks=self.chunks)
-# processed_chunk._hidden_params["usage"] = usage
-# ## LOGGING
-# threading.Thread(
-# target=self.run_success_logging_and_cache_storage,
-# args=(processed_chunk, cache_hit),
-# ).start() # log response
-# return processed_chunk
-# except Exception as e:
-# traceback_exception = traceback.format_exc()
-# # LOG FAILURE - handle streaming failure logging in the _next_ object, remove `handle_failure` once it's deprecated
-# threading.Thread(
-# target=self.logging_obj.failure_handler, args=(e, traceback_exception)
-# ).start()
-# if isinstance(e, OpenAIError):
-# raise e
-# else:
-# raise exception_type(
-# model=self.model,
-# original_exception=e,
-# custom_llm_provider=self.custom_llm_provider,
-# )
-
-# def fetch_sync_stream(self):
-# if self.completion_stream is None and self.make_call is not None:
-# # Call make_call to get the completion stream
-# self.completion_stream = self.make_call(client=litellm.module_level_client)
-# self._stream_iter = self.completion_stream.__iter__()
-
-# return self.completion_stream
-
-# async def fetch_stream(self):
-# if self.completion_stream is None and self.make_call is not None:
-# # Call make_call to get the completion stream
-# self.completion_stream = await self.make_call(
-# client=litellm.module_level_aclient
-# )
-# self._stream_iter = self.completion_stream.__aiter__()
-
-# return self.completion_stream
-
-# async def __anext__(self): # noqa: PLR0915
-# cache_hit = False
-# if (
-# self.custom_llm_provider is not None
-# and self.custom_llm_provider == "cached_response"
-# ):
-# cache_hit = True
-# try:
-# if self.completion_stream is None:
-# await self.fetch_stream()
-
-# if (
-# self.custom_llm_provider == "openai"
-# or self.custom_llm_provider == "azure"
-# or self.custom_llm_provider == "custom_openai"
-# or self.custom_llm_provider == "text-completion-openai"
-# or self.custom_llm_provider == "text-completion-codestral"
-# or self.custom_llm_provider == "azure_text"
-# or self.custom_llm_provider == "anthropic"
-# or self.custom_llm_provider == "anthropic_text"
-# or self.custom_llm_provider == "huggingface"
-# or self.custom_llm_provider == "ollama"
-# or self.custom_llm_provider == "ollama_chat"
-# or self.custom_llm_provider == "vertex_ai"
-# or self.custom_llm_provider == "vertex_ai_beta"
-# or self.custom_llm_provider == "sagemaker"
-# or self.custom_llm_provider == "sagemaker_chat"
-# or self.custom_llm_provider == "gemini"
-# or self.custom_llm_provider == "replicate"
-# or self.custom_llm_provider == "cached_response"
-# or self.custom_llm_provider == "predibase"
-# or self.custom_llm_provider == "databricks"
-# or self.custom_llm_provider == "bedrock"
-# or self.custom_llm_provider == "triton"
-# or self.custom_llm_provider == "watsonx"
-# or self.custom_llm_provider in litellm.openai_compatible_endpoints
-# or self.custom_llm_provider in litellm._custom_providers
-# ):
-# async for chunk in self.completion_stream:
-# if chunk == "None" or chunk is None:
-# raise Exception
-# elif (
-# self.custom_llm_provider == "gemini"
-# and hasattr(chunk, "parts")
-# and len(chunk.parts) == 0
-# ):
-# continue
-# # chunk_creator() does logging/stream chunk building. We need to let it know its being called in_async_func, so we don't double add chunks.
-# # __anext__ also calls async_success_handler, which does logging
-# print_verbose(f"PROCESSED ASYNC CHUNK PRE CHUNK CREATOR: {chunk}")
-
-# processed_chunk: Optional[ModelResponse] = self.chunk_creator(
-# chunk=chunk
-# )
-# print_verbose(
-# f"PROCESSED ASYNC CHUNK POST CHUNK CREATOR: {processed_chunk}"
-# )
-# if processed_chunk is None:
-# continue
-# ## LOGGING
-# ## LOGGING
-# executor.submit(
-# self.logging_obj.success_handler,
-# result=processed_chunk,
-# start_time=None,
-# end_time=None,
-# cache_hit=cache_hit,
-# )
-
-# asyncio.create_task(
-# self.logging_obj.async_success_handler(
-# processed_chunk, cache_hit=cache_hit
-# )
-# )
-
-# if self.logging_obj._llm_caching_handler is not None:
-# asyncio.create_task(
-# self.logging_obj._llm_caching_handler._add_streaming_response_to_cache(
-# processed_chunk=processed_chunk,
-# )
-# )
-
-# choice = processed_chunk.choices[0]
-# if isinstance(choice, StreamingChoices):
-# self.response_uptil_now += choice.delta.get("content", "") or ""
-# else:
-# self.response_uptil_now += ""
-# self.rules.post_call_rules(
-# input=self.response_uptil_now, model=self.model
-# )
-# self.chunks.append(processed_chunk)
-# if hasattr(
-# processed_chunk, "usage"
-# ): # remove usage from chunk, only send on final chunk
-# # Convert the object to a dictionary
-# obj_dict = processed_chunk.dict()
-
-# # Remove an attribute (e.g., 'attr2')
-# if "usage" in obj_dict:
-# del obj_dict["usage"]
-
-# # Create a new object without the removed attribute
-# processed_chunk = self.model_response_creator(chunk=obj_dict)
-# print_verbose(f"final returned processed chunk: {processed_chunk}")
-# return processed_chunk
-# raise StopAsyncIteration
-# else: # temporary patch for non-aiohttp async calls
-# # example - boto3 bedrock llms
-# while True:
-# if isinstance(self.completion_stream, str) or isinstance(
-# self.completion_stream, bytes
-# ):
-# chunk = self.completion_stream
-# else:
-# chunk = next(self.completion_stream)
-# if chunk is not None and chunk != b"":
-# print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}")
-# processed_chunk: Optional[ModelResponse] = self.chunk_creator(
-# chunk=chunk
-# )
-# print_verbose(
-# f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}"
-# )
-# if processed_chunk is None:
-# continue
-# ## LOGGING
-# threading.Thread(
-# target=self.logging_obj.success_handler,
-# args=(processed_chunk, None, None, cache_hit),
-# ).start() # log processed_chunk
-# asyncio.create_task(
-# self.logging_obj.async_success_handler(
-# processed_chunk, cache_hit=cache_hit
-# )
-# )
-
-# choice = processed_chunk.choices[0]
-# if isinstance(choice, StreamingChoices):
-# self.response_uptil_now += (
-# choice.delta.get("content", "") or ""
-# )
-# else:
-# self.response_uptil_now += ""
-# self.rules.post_call_rules(
-# input=self.response_uptil_now, model=self.model
-# )
-# # RETURN RESULT
-# self.chunks.append(processed_chunk)
-# return processed_chunk
-# except (StopAsyncIteration, StopIteration):
-# if self.sent_last_chunk is True:
-# # log the final chunk with accurate streaming values
-# complete_streaming_response = litellm.stream_chunk_builder(
-# chunks=self.chunks, messages=self.messages
-# )
-# response = self.model_response_creator()
-# if complete_streaming_response is not None:
-# setattr(
-# response,
-# "usage",
-# getattr(complete_streaming_response, "usage"),
-# )
-# ## LOGGING
-# threading.Thread(
-# target=self.logging_obj.success_handler,
-# args=(response, None, None, cache_hit),
-# ).start() # log response
-# asyncio.create_task(
-# self.logging_obj.async_success_handler(
-# response, cache_hit=cache_hit
-# )
-# )
-# if self.sent_stream_usage is False and self.send_stream_usage is True:
-# self.sent_stream_usage = True
-# return response
-# raise StopAsyncIteration # Re-raise StopIteration
-# else:
-# self.sent_last_chunk = True
-# processed_chunk = self.finish_reason_handler()
-# ## LOGGING
-# threading.Thread(
-# target=self.logging_obj.success_handler,
-# args=(processed_chunk, None, None, cache_hit),
-# ).start() # log response
-# asyncio.create_task(
-# self.logging_obj.async_success_handler(
-# processed_chunk, cache_hit=cache_hit
-# )
-# )
-# return processed_chunk
-# except httpx.TimeoutException as e: # if httpx read timeout error occues
-# traceback_exception = traceback.format_exc()
-# ## ADD DEBUG INFORMATION - E.G. LITELLM REQUEST TIMEOUT
-# traceback_exception += "\nLiteLLM Default Request Timeout - {}".format(
-# litellm.request_timeout
-# )
-# if self.logging_obj is not None:
-# ## LOGGING
-# threading.Thread(
-# target=self.logging_obj.failure_handler,
-# args=(e, traceback_exception),
-# ).start() # log response
-# # Handle any exceptions that might occur during streaming
-# asyncio.create_task(
-# self.logging_obj.async_failure_handler(e, traceback_exception)
-# )
-# raise e
-# except Exception as e:
-# traceback_exception = traceback.format_exc()
-# if self.logging_obj is not None:
-# ## LOGGING
-# threading.Thread(
-# target=self.logging_obj.failure_handler,
-# args=(e, traceback_exception),
-# ).start() # log response
-# # Handle any exceptions that might occur during streaming
-# asyncio.create_task(
-# self.logging_obj.async_failure_handler(e, traceback_exception) # type: ignore
-# )
-# ## Map to OpenAI Exception
-# raise exception_type(
-# model=self.model,
-# custom_llm_provider=self.custom_llm_provider,
-# original_exception=e,
-# completion_kwargs={},
-# extra_kwargs={},
-# )
-
-
class TextCompletionStreamWrapper:
def __init__(
self,
@@ -7977,7 +5869,6 @@ def get_valid_models() -> List[str]:
if expected_provider_key in environ_keys:
# key is set
valid_providers.append(provider)
-
for provider in valid_providers:
if provider == "azure":
valid_models.append("Azure-LLM")
@@ -8253,10 +6144,13 @@ def validate_chat_completion_user_messages(messages: List[AllMessageValues]):
if isinstance(item, dict):
if item.get("type") not in ValidUserMessageContentTypes:
raise Exception("invalid content type")
- except Exception:
- raise Exception(
- f"Invalid user message={m} at index {idx}. Please ensure all user messages are valid OpenAI chat completion messages."
- )
+ except Exception as e:
+ if "invalid content type" in str(e):
+ raise Exception(
+ f"Invalid user message={m} at index {idx}. Please ensure all user messages are valid OpenAI chat completion messages."
+ )
+ else:
+ raise e
return messages
diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json
index e8aeac2cb..48b25523e 100644
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@@ -1898,7 +1898,8 @@
"supports_function_calling": true,
"tool_use_system_prompt_tokens": 264,
"supports_assistant_prefill": true,
- "supports_prompt_caching": true
+ "supports_prompt_caching": true,
+ "supports_pdf_input": true
},
"claude-3-opus-20240229": {
"max_tokens": 4096,
diff --git a/tests/llm_translation/base_llm_unit_tests.py b/tests/llm_translation/base_llm_unit_tests.py
index 4f9cd9c25..96004eb4e 100644
--- a/tests/llm_translation/base_llm_unit_tests.py
+++ b/tests/llm_translation/base_llm_unit_tests.py
@@ -44,3 +44,30 @@ class BaseLLMChatTest(ABC):
messages=messages,
)
assert response is not None
+
+ @pytest.fixture
+ def pdf_messages(self):
+ import base64
+
+ import requests
+
+ # URL of the file
+ url = "https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf"
+
+ response = requests.get(url)
+ file_data = response.content
+
+ encoded_file = base64.b64encode(file_data).decode("utf-8")
+ url = f"data:application/pdf;base64,{encoded_file}"
+
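+ # The PDF is passed as a base64 data URL inside the OpenAI-style image_url content block.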
+ image_content = [
+ {"type": "text", "text": "What's this file about?"},
+ {
+ "type": "image_url",
+ "image_url": {"url": url},
+ },
+ ]
+
+ image_messages = [{"role": "user", "content": image_content}]
+
+ return image_messages
diff --git a/tests/llm_translation/test_anthropic_completion.py b/tests/llm_translation/test_anthropic_completion.py
index 46f01e0ec..9d7c9af73 100644
--- a/tests/llm_translation/test_anthropic_completion.py
+++ b/tests/llm_translation/test_anthropic_completion.py
@@ -36,6 +36,7 @@ from litellm.types.llms.anthropic import AnthropicResponse
from litellm.llms.anthropic.common_utils import process_anthropic_headers
from httpx import Headers
+from base_llm_unit_tests import BaseLLMChatTest
def test_anthropic_completion_messages_translation():
@@ -624,3 +625,40 @@ def test_anthropic_tool_helper(cache_control_location):
tool = AnthropicConfig()._map_tool_helper(tool=tool)
assert tool["cache_control"] == {"type": "ephemeral"}
+
+
+from litellm import completion
+
+
+class TestAnthropicCompletion(BaseLLMChatTest):
+ def get_base_completion_call_args(self) -> dict:
+ return {"model": "claude-3-haiku-20240307"}
+
+ def test_pdf_handling(self, pdf_messages):
+ from litellm.llms.custom_httpx.http_handler import HTTPHandler
+ from litellm.types.llms.anthropic import AnthropicMessagesDocumentParam
+ import json
+
+ client = HTTPHandler()
+
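+ # Patch the client's post method so the test can inspect the raw request LiteLLM builds, without calling the Anthropic API.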
+ with patch.object(client, "post", new=MagicMock()) as mock_client:
+ response = completion(
+ model="claude-3-5-sonnet-20241022",
+ messages=pdf_messages,
+ client=client,
+ )
+
+ mock_client.assert_called_once()
+
+ json_data = json.loads(mock_client.call_args.kwargs["data"])
+ headers = mock_client.call_args.kwargs["headers"]
+
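+ # LiteLLM should send Anthropic's PDF beta header when a PDF content block is present.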
+ assert headers["anthropic-beta"] == "pdfs-2024-09-25"
+
+ assert json_data["messages"][0]["role"] == "user"
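+ # The image_url PDF block should be translated into an Anthropic "document" content block with a base64 source.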
+ _document_validation = AnthropicMessagesDocumentParam(
+ **json_data["messages"][0]["content"][1]
+ )
+ assert _document_validation["type"] == "document"
+ assert _document_validation["source"]["media_type"] == "application/pdf"
+ assert _document_validation["source"]["type"] == "base64"
diff --git a/tests/local_testing/test_get_llm_provider.py b/tests/local_testing/test_get_llm_provider.py
index f7126cec0..6654c10c2 100644
--- a/tests/local_testing/test_get_llm_provider.py
+++ b/tests/local_testing/test_get_llm_provider.py
@@ -169,3 +169,11 @@ def test_get_llm_provider_hosted_vllm():
assert custom_llm_provider == "hosted_vllm"
assert model == "llama-3.1-70b-instruct"
assert dynamic_api_key == ""
+
+
+def test_get_llm_provider_watson_text():
+ model, custom_llm_provider, dynamic_api_key, api_base = litellm.get_llm_provider(
+ model="watsonx_text/watson-text-to-speech",
+ )
+ assert custom_llm_provider == "watsonx_text"
+ assert model == "watson-text-to-speech"
diff --git a/tests/local_testing/test_get_model_list.py b/tests/local_testing/test_get_model_list.py
deleted file mode 100644
index 7663eebf5..000000000
--- a/tests/local_testing/test_get_model_list.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import os, sys, traceback
-
-sys.path.insert(
- 0, os.path.abspath("../..")
-) # Adds the parent directory to the system path
-import litellm
-from litellm import get_model_list
-
-print(get_model_list())
-print(get_model_list())
-# print(litellm.model_list)
diff --git a/tests/local_testing/test_opentelemetry_unit_tests.py b/tests/local_testing/test_opentelemetry_unit_tests.py
deleted file mode 100644
index 530adc6ab..000000000
--- a/tests/local_testing/test_opentelemetry_unit_tests.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# What is this?
-## Unit tests for opentelemetry integration
-
-# What is this?
-## Unit test for presidio pii masking
-import sys, os, asyncio, time, random
-from datetime import datetime
-import traceback
-from dotenv import load_dotenv
-
-load_dotenv()
-import os
-import asyncio
-
-sys.path.insert(
- 0, os.path.abspath("../..")
-) # Adds the parent directory to the system path
-import pytest
-import litellm
-from unittest.mock import patch, MagicMock, AsyncMock
-
-
-@pytest.mark.asyncio
-async def test_opentelemetry_integration():
- """
- Unit test to confirm the parent otel span is ended
- """
-
- parent_otel_span = MagicMock()
- litellm.callbacks = ["otel"]
-
- await litellm.acompletion(
- model="gpt-3.5-turbo",
- messages=[{"role": "user", "content": "Hello, world!"}],
- mock_response="Hey!",
- metadata={"litellm_parent_otel_span": parent_otel_span},
- )
-
- await asyncio.sleep(1)
-
- parent_otel_span.end.assert_called_once()
diff --git a/tests/local_testing/test_utils.py b/tests/local_testing/test_utils.py
index 5aa3b610c..b3f8208bf 100644
--- a/tests/local_testing/test_utils.py
+++ b/tests/local_testing/test_utils.py
@@ -943,3 +943,24 @@ def test_validate_chat_completion_user_messages(messages, expected_bool):
## Invalid message
with pytest.raises(Exception):
validate_chat_completion_user_messages(messages=messages)
+
+
+def test_models_by_provider():
+ """
+ Make sure all providers from model map are in the valid providers list
+ """
+ from litellm import models_by_provider
+
+ providers = set()
+ for k, v in litellm.model_cost.items():
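+ # providers containing both "_" and "-" appear to be cost-tracking groupings rather than real providers; skip them along with the sample spec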
+ if "_" in v["litellm_provider"] and "-" in v["litellm_provider"]:
+ continue
+ elif k == "sample_spec":
+ continue
+ elif v["litellm_provider"] == "sagemaker":
+ continue
+ else:
+ providers.add(v["litellm_provider"])
+
+ for provider in providers:
+ assert provider in models_by_provider.keys()
diff --git a/tests/logging_callback_tests/base_test.py b/tests/logging_callback_tests/base_test.py
new file mode 100644
index 000000000..0d1e7dfcf
--- /dev/null
+++ b/tests/logging_callback_tests/base_test.py
@@ -0,0 +1,100 @@
+import asyncio
+import httpx
+import json
+import pytest
+import sys
+from typing import Any, Dict, List
+from unittest.mock import MagicMock, Mock, patch
+import os
+
+sys.path.insert(
+ 0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
+import litellm
+from litellm.exceptions import BadRequestError
+from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
+from litellm.utils import CustomStreamWrapper
+from litellm.types.utils import ModelResponse
+
+from abc import ABC, abstractmethod
+
+
+class BaseLoggingCallbackTest(ABC):
+ """
+ Abstract base test class that enforces a common test across all test classes.
+ """
+
+ @pytest.fixture
+ def mock_response_obj(self):
+ from litellm.types.utils import (
+ ModelResponse,
+ Choices,
+ Message,
+ ChatCompletionMessageToolCall,
+ Function,
+ Usage,
+ CompletionTokensDetailsWrapper,
+ PromptTokensDetailsWrapper,
+ )
+
+ # Create a mock response object with the structure you need
+ return ModelResponse(
+ id="chatcmpl-ASId3YJWagBpBskWfoNEMPFSkmrEw",
+ created=1731308157,
+ model="gpt-4o-mini-2024-07-18",
+ object="chat.completion",
+ system_fingerprint="fp_0ba0d124f1",
+ choices=[
+ Choices(
+ finish_reason="tool_calls",
+ index=0,
+ message=Message(
+ content=None,
+ role="assistant",
+ tool_calls=[
+ ChatCompletionMessageToolCall(
+ function=Function(
+ arguments='{"city": "New York"}', name="get_weather"
+ ),
+ id="call_PngsQS5YGmIZKnswhnUOnOVb",
+ type="function",
+ ),
+ ChatCompletionMessageToolCall(
+ function=Function(
+ arguments='{"city": "New York"}', name="get_news"
+ ),
+ id="call_1zsDThBu0VSK7KuY7eCcJBnq",
+ type="function",
+ ),
+ ],
+ function_call=None,
+ ),
+ )
+ ],
+ usage=Usage(
+ completion_tokens=46,
+ prompt_tokens=86,
+ total_tokens=132,
+ completion_tokens_details=CompletionTokensDetailsWrapper(
+ accepted_prediction_tokens=0,
+ audio_tokens=0,
+ reasoning_tokens=0,
+ rejected_prediction_tokens=0,
+ text_tokens=None,
+ ),
+ prompt_tokens_details=PromptTokensDetailsWrapper(
+ audio_tokens=0, cached_tokens=0, text_tokens=None, image_tokens=None
+ ),
+ ),
+ service_tier=None,
+ )
+
+ @abstractmethod
+ def test_parallel_tool_calls(self, mock_response_obj: ModelResponse):
+ """
+ Check if parallel tool calls are correctly logged by Logging callback
+
+ Relevant issue - https://github.com/BerriAI/litellm/issues/6677
+ """
+ pass
diff --git a/tests/logging_callback_tests/test_opentelemetry_unit_tests.py b/tests/logging_callback_tests/test_opentelemetry_unit_tests.py
new file mode 100644
index 000000000..b0d09562c
--- /dev/null
+++ b/tests/logging_callback_tests/test_opentelemetry_unit_tests.py
@@ -0,0 +1,58 @@
+# What is this?
+## Unit tests for opentelemetry integration
+
+import sys, os, asyncio, time, random
+from datetime import datetime
+import traceback
+from dotenv import load_dotenv
+
+load_dotenv()
+import os
+import asyncio
+
+sys.path.insert(
+ 0, os.path.abspath("../..")
+) # Adds the parent directory to the system path
+import pytest
+import litellm
+from unittest.mock import patch, MagicMock, AsyncMock
+from base_test import BaseLoggingCallbackTest
+from litellm.types.utils import ModelResponse
+
+
+class TestOpentelemetryUnitTests(BaseLoggingCallbackTest):
+ def test_parallel_tool_calls(self, mock_response_obj: ModelResponse):
+ tool_calls = mock_response_obj.choices[0].message.tool_calls
+ from litellm.integrations.opentelemetry import OpenTelemetry
+ from litellm.proxy._types import SpanAttributes
+
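+ # each parallel tool call should be flattened into indexed function_call.* span attributes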
+ kv_pair_dict = OpenTelemetry._tool_calls_kv_pair(tool_calls)
+
+ assert kv_pair_dict == {
+ f"{SpanAttributes.LLM_COMPLETIONS}.0.function_call.arguments": '{"city": "New York"}',
+ f"{SpanAttributes.LLM_COMPLETIONS}.0.function_call.name": "get_weather",
+ f"{SpanAttributes.LLM_COMPLETIONS}.1.function_call.arguments": '{"city": "New York"}',
+ f"{SpanAttributes.LLM_COMPLETIONS}.1.function_call.name": "get_news",
+ }
+
+ @pytest.mark.asyncio
+ async def test_opentelemetry_integration(self):
+ """
+ Unit test to confirm the parent otel span is ended
+ """
+
+ parent_otel_span = MagicMock()
+ litellm.callbacks = ["otel"]
+
+ await litellm.acompletion(
+ model="gpt-3.5-turbo",
+ messages=[{"role": "user", "content": "Hello, world!"}],
+ mock_response="Hey!",
+ metadata={"litellm_parent_otel_span": parent_otel_span},
+ )
+
+ await asyncio.sleep(1)
+
+ parent_otel_span.end.assert_called_once()