From d37c8b5c6bf42aa63c65ba5e185c2cf9b1582e14 Mon Sep 17 00:00:00 2001
From: Krish Dholakia <krrishdholakia@gmail.com>
Date: Tue, 24 Sep 2024 15:01:31 -0700
Subject: [PATCH] LiteLLM Minor Fixes & Improvements (09/23/2024)  (#5842)
 (#5858)

* LiteLLM Minor Fixes & Improvements (09/23/2024)  (#5842)

* feat(auth_utils.py): enable admin to allow client-side credentials to be passed

Makes it easier for devs to experiment with finetuned fireworks ai models

* feat(router.py): allow setting configurable_clientside_auth_params for a model

Closes https://github.com/BerriAI/litellm/issues/5843

* build(model_prices_and_context_window.json): fix anthropic claude-3-5-sonnet max output token limit

Fixes https://github.com/BerriAI/litellm/issues/5850

* fix(azure_ai/): support content list for azure ai

Fixes https://github.com/BerriAI/litellm/issues/4237

* fix(litellm_logging.py): always set saved_cache_cost

Set to 0 by default

* fix(fireworks_ai/cost_calculator.py): add fireworks ai default pricing

handles calling 405b+ size models

* fix(slack_alerting.py): fix error alerting for failed spend tracking

Fixes regression with slack alerting error monitoring

* fix(vertex_and_google_ai_studio_gemini.py): handle gemini no candidates in streaming chunk error

* docs(bedrock.md): add llama3-1 models

* test: fix tests

* fix(azure_ai/chat): fix transformation for azure ai calls
---
 docs/my-website/docs/providers/bedrock.md     |   3 +
 litellm/__init__.py                           |   2 +-
 .../SlackAlerting/slack_alerting.py           |  19 +-
 litellm/litellm_core_utils/litellm_logging.py |   9 +-
 litellm/llms/OpenAI/openai.py                 |  19 --
 litellm/llms/azure_ai/chat/handler.py         |  59 ++++
 litellm/llms/azure_ai/chat/transformation.py  |  31 ++
 litellm/llms/fireworks_ai/cost_calculator.py  |  16 +-
 litellm/llms/prompt_templates/common_utils.py |  32 ++
 .../vertex_and_google_ai_studio_gemini.py     | 139 ++++----
 litellm/main.py                               |   6 +-
 ...odel_prices_and_context_window_backup.json |  21 +-
 litellm/proxy/_new_secret_config.yaml         |   7 +
 litellm/proxy/auth/auth_utils.py              |  63 +++-
 litellm/proxy/route_llm_request.py            |   2 +-
 .../spend_tracking/spend_tracking_utils.py    |   4 +
 litellm/proxy/utils.py                        |  26 +-
 litellm/router.py                             | 304 ++++++++++--------
 litellm/tests/test_completion.py              |  11 +-
 litellm/tests/test_completion_cost.py         |  25 +-
 litellm/tests/test_proxy_utils.py             |  76 +++++
 litellm/types/llms/vertex_ai.py               |   2 +-
 litellm/types/router.py                       |   5 +
 litellm/types/utils.py                        |   3 +-
 model_prices_and_context_window.json          |  21 +-
 25 files changed, 611 insertions(+), 294 deletions(-)
 create mode 100644 litellm/llms/azure_ai/chat/handler.py
 create mode 100644 litellm/llms/azure_ai/chat/transformation.py
 create mode 100644 litellm/llms/prompt_templates/common_utils.py

diff --git a/docs/my-website/docs/providers/bedrock.md b/docs/my-website/docs/providers/bedrock.md
index 7ceedf6ef..6548714b2 100644
--- a/docs/my-website/docs/providers/bedrock.md
+++ b/docs/my-website/docs/providers/bedrock.md
@@ -987,6 +987,9 @@ Here's an example of using a bedrock model with LiteLLM. For a complete list, re
 | Anthropic Claude-V2.1      | `completion(model='bedrock/anthropic.claude-v2:1', messages=messages)`   | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`           |
 | Anthropic Claude-V2        | `completion(model='bedrock/anthropic.claude-v2', messages=messages)`   | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`           |
 | Anthropic Claude-Instant V1 | `completion(model='bedrock/anthropic.claude-instant-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`           |
+| Meta llama3-1-405b        | `completion(model='bedrock/meta.llama3-1-405b-instruct-v1:0', messages=messages)`   | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`           |
+| Meta llama3-1-70b        | `completion(model='bedrock/meta.llama3-1-70b-instruct-v1:0', messages=messages)`   | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`           |
+| Meta llama3-1-8b        | `completion(model='bedrock/meta.llama3-1-8b-instruct-v1:0', messages=messages)`   | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`           |
 | Meta llama3-70b        | `completion(model='bedrock/meta.llama3-70b-instruct-v1:0', messages=messages)`   | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`           |
 | Meta llama3-8b | `completion(model='bedrock/meta.llama3-8b-instruct-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`           |
 | Amazon Titan Lite          | `completion(model='bedrock/amazon.titan-text-lite-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |
diff --git a/litellm/__init__.py b/litellm/__init__.py
index 816416e38..a94ba534b 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -963,8 +963,8 @@ from .llms.OpenAI.openai import (
     MistralEmbeddingConfig,
     DeepInfraConfig,
     GroqConfig,
-    AzureAIStudioConfig,
 )
+from .llms.azure_ai.chat.transformation import AzureAIStudioConfig
 from .llms.mistral.mistral_chat_transformation import MistralConfig
 from .llms.OpenAI.chat.o1_transformation import (
     OpenAIO1Config,
diff --git a/litellm/integrations/SlackAlerting/slack_alerting.py b/litellm/integrations/SlackAlerting/slack_alerting.py
index 7b2f75b9e..48fec5fc0 100644
--- a/litellm/integrations/SlackAlerting/slack_alerting.py
+++ b/litellm/integrations/SlackAlerting/slack_alerting.py
@@ -10,7 +10,7 @@ import traceback
 from datetime import datetime as dt
 from datetime import timedelta, timezone
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Set, TypedDict, Union
+from typing import Any, Dict, List, Literal, Optional, Set, TypedDict, Union, get_args
 
 import aiohttp
 import dotenv
@@ -57,20 +57,7 @@ class SlackAlerting(CustomBatchLogger):
             float
         ] = None,  # threshold for slow / hanging llm responses (in seconds)
         alerting: Optional[List] = [],
-        alert_types: List[AlertType] = [
-            "llm_exceptions",
-            "llm_too_slow",
-            "llm_requests_hanging",
-            "budget_alerts",
-            "db_exceptions",
-            "daily_reports",
-            "spend_reports",
-            "fallback_reports",
-            "cooldown_deployment",
-            "new_model_added",
-            "outage_alerts",
-            "failed_tracking_spend",
-        ],
+        alert_types: List[AlertType] = list(get_args(AlertType)),
         alert_to_webhook_url: Optional[
             Dict[AlertType, Union[List[str], str]]
         ] = None,  # if user wants to separate alerts to diff channels
@@ -613,7 +600,7 @@ class SlackAlerting(CustomBatchLogger):
             await self.send_alert(
                 message=message,
                 level="High",
-                alert_type="budget_alerts",
+                alert_type="failed_tracking_spend",
                 alerting_metadata={},
             )
             await _cache.async_set_cache(
diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py
index 44de32b15..1219feac9 100644
--- a/litellm/litellm_core_utils/litellm_logging.py
+++ b/litellm/litellm_core_utils/litellm_logging.py
@@ -2498,13 +2498,16 @@ def get_standard_logging_object_payload(
         else:
             cache_key = None
 
-        saved_cache_cost: Optional[float] = None
+        saved_cache_cost: float = 0.0
         if cache_hit is True:
 
             id = f"{id}_cache_hit{time.time()}"  # do not duplicate the request id
 
-            saved_cache_cost = logging_obj._response_cost_calculator(
-                result=init_response_obj, cache_hit=False  # type: ignore
+            saved_cache_cost = (
+                logging_obj._response_cost_calculator(
+                    result=init_response_obj, cache_hit=False  # type: ignore
+                )
+                or 0.0
             )
 
         ## Get model cost information ##
diff --git a/litellm/llms/OpenAI/openai.py b/litellm/llms/OpenAI/openai.py
index c67968a5d..87aa095ab 100644
--- a/litellm/llms/OpenAI/openai.py
+++ b/litellm/llms/OpenAI/openai.py
@@ -103,25 +103,6 @@ class MistralEmbeddingConfig:
         return optional_params
 
 
-class AzureAIStudioConfig:
-    def get_required_params(self) -> List[ProviderField]:
-        """For a given provider, return it's required fields with a description"""
-        return [
-            ProviderField(
-                field_name="api_key",
-                field_type="string",
-                field_description="Your Azure AI Studio API Key.",
-                field_value="zEJ...",
-            ),
-            ProviderField(
-                field_name="api_base",
-                field_type="string",
-                field_description="Your Azure AI Studio API Base.",
-                field_value="https://Mistral-serverless.",
-            ),
-        ]
-
-
 class DeepInfraConfig:
     """
     Reference: https://deepinfra.com/docs/advanced/openai_api
diff --git a/litellm/llms/azure_ai/chat/handler.py b/litellm/llms/azure_ai/chat/handler.py
new file mode 100644
index 000000000..ce270d8f6
--- /dev/null
+++ b/litellm/llms/azure_ai/chat/handler.py
@@ -0,0 +1,59 @@
+from typing import Any, Callable, List, Optional, Union
+
+from httpx._config import Timeout
+
+from litellm.llms.bedrock.chat.invoke_handler import MockResponseIterator
+from litellm.llms.OpenAI.openai import OpenAIChatCompletion
+from litellm.types.utils import ModelResponse
+from litellm.utils import CustomStreamWrapper
+
+from .transformation import AzureAIStudioConfig
+
+
+class AzureAIChatCompletion(OpenAIChatCompletion):
+    def completion(
+        self,
+        model_response: ModelResponse,
+        timeout: Union[float, Timeout],
+        optional_params: dict,
+        logging_obj: Any,
+        model: Optional[str] = None,
+        messages: Optional[list] = None,
+        print_verbose: Optional[Callable[..., Any]] = None,
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+        acompletion: bool = False,
+        litellm_params=None,
+        logger_fn=None,
+        headers: Optional[dict] = None,
+        custom_prompt_dict: dict = {},
+        client=None,
+        organization: Optional[str] = None,
+        custom_llm_provider: Optional[str] = None,
+        drop_params: Optional[bool] = None,
+    ):
+
+        transformed_messages = AzureAIStudioConfig()._transform_messages(
+            messages=messages  # type: ignore
+        )
+
+        return super().completion(
+            model_response,
+            timeout,
+            optional_params,
+            logging_obj,
+            model,
+            transformed_messages,
+            print_verbose,
+            api_key,
+            api_base,
+            acompletion,
+            litellm_params,
+            logger_fn,
+            headers,
+            custom_prompt_dict,
+            client,
+            organization,
+            custom_llm_provider,
+            drop_params,
+        )
diff --git a/litellm/llms/azure_ai/chat/transformation.py b/litellm/llms/azure_ai/chat/transformation.py
new file mode 100644
index 000000000..85f107eca
--- /dev/null
+++ b/litellm/llms/azure_ai/chat/transformation.py
@@ -0,0 +1,31 @@
+from typing import List
+
+from litellm.llms.OpenAI.openai import OpenAIConfig
+from litellm.llms.prompt_templates.common_utils import convert_content_list_to_str
+from litellm.types.llms.openai import AllMessageValues
+from litellm.types.utils import ProviderField
+
+
+class AzureAIStudioConfig(OpenAIConfig):
+    def get_required_params(self) -> List[ProviderField]:
+        """For a given provider, return it's required fields with a description"""
+        return [
+            ProviderField(
+                field_name="api_key",
+                field_type="string",
+                field_description="Your Azure AI Studio API Key.",
+                field_value="zEJ...",
+            ),
+            ProviderField(
+                field_name="api_base",
+                field_type="string",
+                field_description="Your Azure AI Studio API Base.",
+                field_value="https://Mistral-serverless.",
+            ),
+        ]
+
+    def _transform_messages(self, messages: List[AllMessageValues]) -> List:
+        for message in messages:
+            message = convert_content_list_to_str(message=message)
+
+        return messages
diff --git a/litellm/llms/fireworks_ai/cost_calculator.py b/litellm/llms/fireworks_ai/cost_calculator.py
index 83ce97d4c..f53aba4a4 100644
--- a/litellm/llms/fireworks_ai/cost_calculator.py
+++ b/litellm/llms/fireworks_ai/cost_calculator.py
@@ -10,7 +10,7 @@ from litellm.utils import get_model_info
 
 # Extract the number of billion parameters from the model name
 # only used for together_computer LLMs
-def get_model_params_and_category(model_name: str) -> str:
+def get_base_model_for_pricing(model_name: str) -> str:
     """
     Helper function for calculating together ai pricing.
 
@@ -43,7 +43,7 @@ def get_model_params_and_category(model_name: str) -> str:
             return "fireworks-ai-16b-80b"
 
     # If no matches, return the original model_name
-    return model_name
+    return "fireworks-ai-default"
 
 
 def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:
@@ -57,10 +57,16 @@ def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:
     Returns:
         Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
     """
-    base_model = get_model_params_and_category(model_name=model)
+    ## check if model mapped, else use default pricing
+    try:
+        model_info = get_model_info(model=model, custom_llm_provider="fireworks_ai")
+    except Exception:
+        base_model = get_base_model_for_pricing(model_name=model)
 
-    ## GET MODEL INFO
-    model_info = get_model_info(model=base_model, custom_llm_provider="fireworks_ai")
+        ## GET MODEL INFO
+        model_info = get_model_info(
+            model=base_model, custom_llm_provider="fireworks_ai"
+        )
 
     ## CALCULATE INPUT COST
 
diff --git a/litellm/llms/prompt_templates/common_utils.py b/litellm/llms/prompt_templates/common_utils.py
new file mode 100644
index 000000000..e32ae3709
--- /dev/null
+++ b/litellm/llms/prompt_templates/common_utils.py
@@ -0,0 +1,32 @@
+"""
+Common utility functions used for translating messages across providers
+"""
+
+from typing import List
+
+from litellm.types.llms.openai import AllMessageValues
+
+
+def convert_content_list_to_str(message: AllMessageValues) -> AllMessageValues:
+    """
+    - handles scenario where content is list and not string
+    - content list is just text, and no images
+    - if image passed in, then just return as is (user-intended)
+
+    Motivation: mistral api + azure ai don't support content as a list
+    """
+    texts = ""
+    message_content = message.get("content")
+    if message_content:
+        if message_content is not None and isinstance(message_content, list):
+            for c in message_content:
+                text_content = c.get("text")
+                if text_content:
+                    texts += text_content
+        elif message_content is not None and isinstance(message_content, str):
+            texts = message_content
+
+    if texts:
+        message["content"] = texts
+
+    return message
diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/gemini/vertex_and_google_ai_studio_gemini.py b/litellm/llms/vertex_ai_and_google_ai_studio/gemini/vertex_and_google_ai_studio_gemini.py
index 858336bf0..35a7e8337 100644
--- a/litellm/llms/vertex_ai_and_google_ai_studio/gemini/vertex_and_google_ai_studio_gemini.py
+++ b/litellm/llms/vertex_ai_and_google_ai_studio/gemini/vertex_and_google_ai_studio_gemini.py
@@ -49,6 +49,7 @@ from litellm.types.llms.openai import (
     ChatCompletionUsageBlock,
 )
 from litellm.types.llms.vertex_ai import (
+    Candidates,
     ContentType,
     FunctionCallingConfig,
     FunctionDeclaration,
@@ -187,7 +188,11 @@ class VertexAIConfig:
                     optional_params["stop_sequences"] = value
             if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_output_tokens"] = value
-            if param == "response_format" and value["type"] == "json_object":
+            if (
+                param == "response_format"
+                and isinstance(value, dict)
+                and value["type"] == "json_object"
+            ):
                 optional_params["response_mime_type"] = "application/json"
             if param == "frequency_penalty":
                 optional_params["frequency_penalty"] = value
@@ -900,14 +905,14 @@ class VertexLLM(VertexBase):
 
                 return model_response
 
-        if len(completion_response["candidates"]) > 0:
+        _candidates = completion_response.get("candidates")
+        if _candidates and len(_candidates) > 0:
             content_policy_violations = (
                 VertexGeminiConfig().get_flagged_finish_reasons()
             )
             if (
-                "finishReason" in completion_response["candidates"][0]
-                and completion_response["candidates"][0]["finishReason"]
-                in content_policy_violations.keys()
+                "finishReason" in _candidates[0]
+                and _candidates[0]["finishReason"] in content_policy_violations.keys()
             ):
                 ## CONTENT POLICY VIOLATION ERROR
                 model_response.choices[0].finish_reason = "content_filter"
@@ -956,55 +961,58 @@ class VertexLLM(VertexBase):
             content_str = ""
             tools: List[ChatCompletionToolCallChunk] = []
             functions: Optional[ChatCompletionToolCallFunctionChunk] = None
-            for idx, candidate in enumerate(completion_response["candidates"]):
-                if "content" not in candidate:
-                    continue
+            if _candidates:
+                for idx, candidate in enumerate(_candidates):
+                    if "content" not in candidate:
+                        continue
 
-                if "groundingMetadata" in candidate:
-                    grounding_metadata.append(candidate["groundingMetadata"])
+                    if "groundingMetadata" in candidate:
+                        grounding_metadata.append(candidate["groundingMetadata"])  # type: ignore
 
-                if "safetyRatings" in candidate:
-                    safety_ratings.append(candidate["safetyRatings"])
+                    if "safetyRatings" in candidate:
+                        safety_ratings.append(candidate["safetyRatings"])
 
-                if "citationMetadata" in candidate:
-                    citation_metadata.append(candidate["citationMetadata"])
-                if "text" in candidate["content"]["parts"][0]:
-                    content_str = candidate["content"]["parts"][0]["text"]
+                    if "citationMetadata" in candidate:
+                        citation_metadata.append(candidate["citationMetadata"])
+                    if "text" in candidate["content"]["parts"][0]:
+                        content_str = candidate["content"]["parts"][0]["text"]
 
-                if "functionCall" in candidate["content"]["parts"][0]:
-                    _function_chunk = ChatCompletionToolCallFunctionChunk(
-                        name=candidate["content"]["parts"][0]["functionCall"]["name"],
-                        arguments=json.dumps(
-                            candidate["content"]["parts"][0]["functionCall"]["args"]
-                        ),
-                    )
-                    if litellm_params.get("litellm_param_is_function_call") is True:
-                        functions = _function_chunk
-                    else:
-                        _tool_response_chunk = ChatCompletionToolCallChunk(
-                            id=f"call_{str(uuid.uuid4())}",
-                            type="function",
-                            function=_function_chunk,
-                            index=candidate.get("index", idx),
+                    if "functionCall" in candidate["content"]["parts"][0]:
+                        _function_chunk = ChatCompletionToolCallFunctionChunk(
+                            name=candidate["content"]["parts"][0]["functionCall"][
+                                "name"
+                            ],
+                            arguments=json.dumps(
+                                candidate["content"]["parts"][0]["functionCall"]["args"]
+                            ),
                         )
-                        tools.append(_tool_response_chunk)
-                chat_completion_message["content"] = (
-                    content_str if len(content_str) > 0 else None
-                )
-                if len(tools) > 0:
-                    chat_completion_message["tool_calls"] = tools
+                        if litellm_params.get("litellm_param_is_function_call") is True:
+                            functions = _function_chunk
+                        else:
+                            _tool_response_chunk = ChatCompletionToolCallChunk(
+                                id=f"call_{str(uuid.uuid4())}",
+                                type="function",
+                                function=_function_chunk,
+                                index=candidate.get("index", idx),
+                            )
+                            tools.append(_tool_response_chunk)
+                    chat_completion_message["content"] = (
+                        content_str if len(content_str) > 0 else None
+                    )
+                    if len(tools) > 0:
+                        chat_completion_message["tool_calls"] = tools
 
-                if functions is not None:
-                    chat_completion_message["function_call"] = functions
-                choice = litellm.Choices(
-                    finish_reason=candidate.get("finishReason", "stop"),
-                    index=candidate.get("index", idx),
-                    message=chat_completion_message,  # type: ignore
-                    logprobs=None,
-                    enhancements=None,
-                )
+                    if functions is not None:
+                        chat_completion_message["function_call"] = functions
+                    choice = litellm.Choices(
+                        finish_reason=candidate.get("finishReason", "stop"),
+                        index=candidate.get("index", idx),
+                        message=chat_completion_message,  # type: ignore
+                        logprobs=None,
+                        enhancements=None,
+                    )
 
-                model_response.choices.append(choice)
+                    model_response.choices.append(choice)
 
             ## GET USAGE ##
             usage = litellm.Usage(
@@ -1433,10 +1441,12 @@ class ModelResponseIterator:
             is_finished = False
             finish_reason = ""
             usage: Optional[ChatCompletionUsageBlock] = None
+            _candidates: Optional[List[Candidates]] = processed_chunk.get("candidates")
+            gemini_chunk: Optional[Candidates] = None
+            if _candidates and len(_candidates) > 0:
+                gemini_chunk = _candidates[0]
 
-            gemini_chunk = processed_chunk["candidates"][0]
-
-            if "content" in gemini_chunk:
+            if gemini_chunk and "content" in gemini_chunk:
                 if "text" in gemini_chunk["content"]["parts"][0]:
                     text = gemini_chunk["content"]["parts"][0]["text"]
                 elif "functionCall" in gemini_chunk["content"]["parts"][0]:
@@ -1455,7 +1465,7 @@ class ModelResponseIterator:
                         index=0,
                     )
 
-            if "finishReason" in gemini_chunk:
+            if gemini_chunk and "finishReason" in gemini_chunk:
                 finish_reason = map_finish_reason(
                     finish_reason=gemini_chunk["finishReason"]
                 )
@@ -1533,18 +1543,19 @@ class ModelResponseIterator:
             )
 
     def _common_chunk_parsing_logic(self, chunk: str) -> GenericStreamingChunk:
-        chunk = chunk.replace("data:", "")
-        if len(chunk) > 0:
-            """
-            Check if initial chunk valid json
-            - if partial json -> enter accumulated json logic
-            - if valid - continue
-            """
-            if self.chunk_type == "valid_json":
-                return self.handle_valid_json_chunk(chunk=chunk)
-            elif self.chunk_type == "accumulated_json":
-                return self.handle_accumulated_json_chunk(chunk=chunk)
-        else:
+        try:
+            chunk = chunk.replace("data:", "")
+            if len(chunk) > 0:
+                """
+                Check if initial chunk valid json
+                - if partial json -> enter accumulated json logic
+                - if valid - continue
+                """
+                if self.chunk_type == "valid_json":
+                    return self.handle_valid_json_chunk(chunk=chunk)
+                elif self.chunk_type == "accumulated_json":
+                    return self.handle_accumulated_json_chunk(chunk=chunk)
+
             return GenericStreamingChunk(
                 text="",
                 is_finished=False,
@@ -1553,6 +1564,8 @@ class ModelResponseIterator:
                 index=0,
                 tool_use=None,
             )
+        except Exception:
+            raise
 
     def __next__(self):
         try:
diff --git a/litellm/main.py b/litellm/main.py
index 51cab8efd..7bb01f937 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -83,6 +83,7 @@ from .llms import (
 from .llms.AI21 import completion as ai21
 from .llms.anthropic.chat import AnthropicChatCompletion
 from .llms.anthropic.completion import AnthropicTextCompletion
+from .llms.azure_ai.chat.handler import AzureAIChatCompletion
 from .llms.azure_text import AzureTextCompletion
 from .llms.AzureOpenAI.audio_transcriptions import AzureAudioTranscription
 from .llms.AzureOpenAI.azure import AzureChatCompletion, _check_dynamic_azure_params
@@ -166,6 +167,7 @@ openai_text_completions = OpenAITextCompletion()
 openai_o1_chat_completions = OpenAIO1ChatCompletion()
 openai_audio_transcriptions = OpenAIAudioTranscription()
 databricks_chat_completions = DatabricksChatCompletion()
+azure_ai_chat_completions = AzureAIChatCompletion()
 anthropic_chat_completions = AnthropicChatCompletion()
 anthropic_text_completions = AnthropicTextCompletion()
 azure_chat_completions = AzureChatCompletion()
@@ -1177,7 +1179,7 @@ def completion(
             headers = headers or litellm.headers
 
             ## LOAD CONFIG - if set
-            config = litellm.OpenAIConfig.get_config()
+            config = litellm.AzureAIStudioConfig.get_config()
             for k, v in config.items():
                 if (
                     k not in optional_params
@@ -1190,7 +1192,7 @@ def completion(
 
             ## COMPLETION CALL
             try:
-                response = openai_chat_completions.completion(
+                response = azure_ai_chat_completions.completion(
                     model=model,
                     messages=messages,
                     headers=headers,
diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json
index 9d5c35f3b..5aef89d45 100644
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@@ -3862,9 +3862,9 @@
         "supports_vision": true
     },
     "anthropic.claude-3-5-sonnet-20240620-v1:0": {
-        "max_tokens": 8192, 
+        "max_tokens": 4096, 
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -3906,9 +3906,9 @@
         "supports_vision": true
     },
     "us.anthropic.claude-3-5-sonnet-20240620-v1:0": {
-        "max_tokens": 8192,
+        "max_tokens": 4096,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -3939,9 +3939,9 @@
         "supports_vision": true
     },
     "eu.anthropic.claude-3-sonnet-20240229-v1:0": {
-        "max_tokens": 8192,
+        "max_tokens": 4096,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -3950,9 +3950,9 @@
         "supports_vision": true
     },
     "eu.anthropic.claude-3-5-sonnet-20240620-v1:0": {
-        "max_tokens": 8192,
+        "max_tokens": 4096,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -5593,6 +5593,11 @@
         "output_cost_per_token": 0.0000012,
         "litellm_provider": "fireworks_ai"
     },
+    "fireworks-ai-default": {
+        "input_cost_per_token": 0.0,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "fireworks_ai"
+    },
     "fireworks-ai-embedding-up-to-150m": {
         "input_cost_per_token": 0.000000008,
         "output_cost_per_token": 0.000000,
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 4e6a493ba..8cc73b050 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -31,6 +31,13 @@ model_list:
   - model_name: "anthropic/*"
     litellm_params:
       model: "anthropic/*"
+  - model_name: "openai/*"
+    litellm_params:
+      model: "openai/*"
+  - model_name: "fireworks_ai/*"
+    litellm_params:
+      model: "fireworks_ai/*"
+      configurable_clientside_auth_params: ["api_base"]
 
 
 litellm_settings:
diff --git a/litellm/proxy/auth/auth_utils.py b/litellm/proxy/auth/auth_utils.py
index 0c228d93f..d34d5d9ef 100644
--- a/litellm/proxy/auth/auth_utils.py
+++ b/litellm/proxy/auth/auth_utils.py
@@ -5,6 +5,7 @@ from typing import List, Optional, Tuple
 
 from fastapi import HTTPException, Request, status
 
+from litellm import Router, provider_list
 from litellm._logging import verbose_proxy_logger
 from litellm.proxy._types import *
 
@@ -72,7 +73,41 @@ def check_complete_credentials(request_body: dict) -> bool:
     return False
 
 
-def is_request_body_safe(request_body: dict) -> bool:
+def _allow_model_level_clientside_configurable_parameters(
+    model: str, param: str, llm_router: Optional[Router]
+) -> bool:
+    """
+    Check if model is allowed to use configurable client-side params
+    - get matching model
+    - check if 'clientside_configurable_parameters' is set for model
+    -
+    """
+    if llm_router is None:
+        return False
+    # check if model is set
+    model_info = llm_router.get_model_group_info(model_group=model)
+    if model_info is None:
+        # check if wildcard model is set
+        if model.split("/", 1)[0] in provider_list:
+            model_info = llm_router.get_model_group_info(
+                model_group=model.split("/", 1)[0]
+            )
+
+    if model_info is None:
+        return False
+
+    if model_info is None or model_info.configurable_clientside_auth_params is None:
+        return False
+
+    if param in model_info.configurable_clientside_auth_params:
+        return True
+
+    return False
+
+
+def is_request_body_safe(
+    request_body: dict, general_settings: dict, llm_router: Optional[Router], model: str
+) -> bool:
     """
     Check if the request body is safe.
 
@@ -88,7 +123,20 @@ def is_request_body_safe(request_body: dict) -> bool:
                 request_body=request_body
             )
         ):
-            raise ValueError(f"BadRequest: {param} is not allowed in request body")
+            if general_settings.get("allow_client_side_credentials") is True:
+                return True
+            elif (
+                _allow_model_level_clientside_configurable_parameters(
+                    model=model, param=param, llm_router=llm_router
+                )
+                is True
+            ):
+                return True
+            raise ValueError(
+                f"Rejected Request: {param} is not allowed in request body. "
+                "Enable with `general_settings::allow_client_side_credentials` on proxy config.yaml. "
+                "Relevant Issue: https://huntr.com/bounties/4001e1a2-7b7a-4776-a3ae-e6692ec3d997",
+            )
 
     return True
 
@@ -110,13 +158,20 @@ async def pre_db_read_auth_checks(
     Raises:
     - HTTPException if request fails initial auth checks
     """
-    from litellm.proxy.proxy_server import general_settings, premium_user
+    from litellm.proxy.proxy_server import general_settings, llm_router, premium_user
 
     # Check 1. request size
     await check_if_request_size_is_safe(request=request)
 
     # Check 2. Request body is safe
-    is_request_body_safe(request_body=request_data)
+    is_request_body_safe(
+        request_body=request_data,
+        general_settings=general_settings,
+        llm_router=llm_router,
+        model=request_data.get(
+            "model", ""
+        ),  # [TODO] use model passed in url as well (azure openai routes)
+    )
 
     # Check 3. Check if IP address is allowed
     is_valid_ip, passed_in_ip = _check_valid_ip(
diff --git a/litellm/proxy/route_llm_request.py b/litellm/proxy/route_llm_request.py
index 41da68b51..4d1ac6c15 100644
--- a/litellm/proxy/route_llm_request.py
+++ b/litellm/proxy/route_llm_request.py
@@ -66,7 +66,7 @@ async def route_request(
     """
     router_model_names = llm_router.model_names if llm_router is not None else []
 
-    if "api_key" in data:
+    if "api_key" in data or "api_base" in data:
         return getattr(litellm, f"{route_type}")(**data)
 
     elif "user_config" in data:
diff --git a/litellm/proxy/spend_tracking/spend_tracking_utils.py b/litellm/proxy/spend_tracking/spend_tracking_utils.py
index bdeef92cc..db7c5f8d8 100644
--- a/litellm/proxy/spend_tracking/spend_tracking_utils.py
+++ b/litellm/proxy/spend_tracking/spend_tracking_utils.py
@@ -4,6 +4,8 @@ import secrets
 import traceback
 from typing import Optional
 
+from pydantic import BaseModel
+
 import litellm
 from litellm._logging import verbose_proxy_logger
 from litellm.proxy._types import SpendLogsMetadata, SpendLogsPayload
@@ -105,6 +107,8 @@ def get_logging_payload(
     additional_usage_values = {}
     for k, v in usage.items():
         if k not in special_usage_fields:
+            if isinstance(v, BaseModel):
+                v = v.model_dump()
             additional_usage_values.update({k: v})
     clean_metadata["additional_usage_values"] = additional_usage_values
 
diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py
index c2948d41e..a861dccbc 100644
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@@ -14,7 +14,17 @@ from datetime import datetime, timedelta
 from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
 from functools import wraps
-from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple, Union, overload
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    get_args,
+    overload,
+)
 
 import backoff
 import httpx
@@ -222,19 +232,7 @@ class ProxyLogging:
         self.cache_control_check = _PROXY_CacheControlCheck()
         self.alerting: Optional[List] = None
         self.alerting_threshold: float = 300  # default to 5 min. threshold
-        self.alert_types: List[AlertType] = [
-            "llm_exceptions",
-            "llm_too_slow",
-            "llm_requests_hanging",
-            "budget_alerts",
-            "db_exceptions",
-            "daily_reports",
-            "spend_reports",
-            "fallback_reports",
-            "cooldown_deployment",
-            "new_model_added",
-            "outage_alerts",
-        ]
+        self.alert_types: List[AlertType] = list(get_args(AlertType))
         self.alert_to_webhook_url: Optional[dict] = None
         self.slack_alerting_instance: SlackAlerting = SlackAlerting(
             alerting_threshold=self.alerting_threshold,
diff --git a/litellm/router.py b/litellm/router.py
index 47b2c8b15..b3a07ad4e 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -4335,157 +4335,177 @@ class Router:
 
         total_tpm: Optional[int] = None
         total_rpm: Optional[int] = None
+        configurable_clientside_auth_params: Optional[List[str]] = None
 
         for model in self.model_list:
-            if "model_name" in model and model["model_name"] == model_group:
-                # model in model group found #
-                litellm_params = LiteLLM_Params(**model["litellm_params"])
-                # get model tpm
-                _deployment_tpm: Optional[int] = None
-                if _deployment_tpm is None:
-                    _deployment_tpm = model.get("tpm", None)
-                if _deployment_tpm is None:
-                    _deployment_tpm = model.get("litellm_params", {}).get("tpm", None)
-                if _deployment_tpm is None:
-                    _deployment_tpm = model.get("model_info", {}).get("tpm", None)
+            is_match = False
+            if (
+                "model_name" in model and model["model_name"] == model_group
+            ):  # exact match
+                is_match = True
+            elif (
+                "model_name" in model
+                and model_group in self.provider_default_deployments
+            ):  # wildcard model
+                is_match = True
 
-                if _deployment_tpm is not None:
-                    if total_tpm is None:
-                        total_tpm = 0
-                    total_tpm += _deployment_tpm  # type: ignore
-                # get model rpm
-                _deployment_rpm: Optional[int] = None
-                if _deployment_rpm is None:
-                    _deployment_rpm = model.get("rpm", None)
-                if _deployment_rpm is None:
-                    _deployment_rpm = model.get("litellm_params", {}).get("rpm", None)
-                if _deployment_rpm is None:
-                    _deployment_rpm = model.get("model_info", {}).get("rpm", None)
+            if not is_match:
+                continue
+            # model in model group found #
+            litellm_params = LiteLLM_Params(**model["litellm_params"])
+            # get configurable clientside auth params
+            configurable_clientside_auth_params = (
+                litellm_params.configurable_clientside_auth_params
+            )
+            # get model tpm
+            _deployment_tpm: Optional[int] = None
+            if _deployment_tpm is None:
+                _deployment_tpm = model.get("tpm", None)
+            if _deployment_tpm is None:
+                _deployment_tpm = model.get("litellm_params", {}).get("tpm", None)
+            if _deployment_tpm is None:
+                _deployment_tpm = model.get("model_info", {}).get("tpm", None)
 
-                if _deployment_rpm is not None:
-                    if total_rpm is None:
-                        total_rpm = 0
-                    total_rpm += _deployment_rpm  # type: ignore
-                # get model info
-                try:
-                    model_info = litellm.get_model_info(model=litellm_params.model)
-                except Exception:
-                    model_info = None
-                # get llm provider
-                model, llm_provider = "", ""
-                try:
-                    model, llm_provider, _, _ = litellm.get_llm_provider(
-                        model=litellm_params.model,
-                        custom_llm_provider=litellm_params.custom_llm_provider,
+            if _deployment_tpm is not None:
+                if total_tpm is None:
+                    total_tpm = 0
+                total_tpm += _deployment_tpm  # type: ignore
+            # get model rpm
+            _deployment_rpm: Optional[int] = None
+            if _deployment_rpm is None:
+                _deployment_rpm = model.get("rpm", None)
+            if _deployment_rpm is None:
+                _deployment_rpm = model.get("litellm_params", {}).get("rpm", None)
+            if _deployment_rpm is None:
+                _deployment_rpm = model.get("model_info", {}).get("rpm", None)
+
+            if _deployment_rpm is not None:
+                if total_rpm is None:
+                    total_rpm = 0
+                total_rpm += _deployment_rpm  # type: ignore
+            # get model info
+            try:
+                model_info = litellm.get_model_info(model=litellm_params.model)
+            except Exception:
+                model_info = None
+            # get llm provider
+            model, llm_provider = "", ""
+            try:
+                model, llm_provider, _, _ = litellm.get_llm_provider(
+                    model=litellm_params.model,
+                    custom_llm_provider=litellm_params.custom_llm_provider,
+                )
+            except litellm.exceptions.BadRequestError as e:
+                verbose_router_logger.error(
+                    "litellm.router.py::get_model_group_info() - {}".format(str(e))
+                )
+
+            if model_info is None:
+                supported_openai_params = litellm.get_supported_openai_params(
+                    model=model, custom_llm_provider=llm_provider
+                )
+                if supported_openai_params is None:
+                    supported_openai_params = []
+                model_info = ModelMapInfo(
+                    key=model_group,
+                    max_tokens=None,
+                    max_input_tokens=None,
+                    max_output_tokens=None,
+                    input_cost_per_token=0,
+                    output_cost_per_token=0,
+                    litellm_provider=llm_provider,
+                    mode="chat",
+                    supported_openai_params=supported_openai_params,
+                    supports_system_messages=None,
+                )
+
+            if model_group_info is None:
+                model_group_info = ModelGroupInfo(
+                    model_group=user_facing_model_group_name, providers=[llm_provider], **model_info  # type: ignore
+                )
+            else:
+                # if max_input_tokens > curr
+                # if max_output_tokens > curr
+                # if input_cost_per_token > curr
+                # if output_cost_per_token > curr
+                # supports_parallel_function_calling == True
+                # supports_vision == True
+                # supports_function_calling == True
+                if llm_provider not in model_group_info.providers:
+                    model_group_info.providers.append(llm_provider)
+                if (
+                    model_info.get("max_input_tokens", None) is not None
+                    and model_info["max_input_tokens"] is not None
+                    and (
+                        model_group_info.max_input_tokens is None
+                        or model_info["max_input_tokens"]
+                        > model_group_info.max_input_tokens
                     )
-                except litellm.exceptions.BadRequestError as e:
-                    verbose_router_logger.error(
-                        "litellm.router.py::get_model_group_info() - {}".format(str(e))
+                ):
+                    model_group_info.max_input_tokens = model_info["max_input_tokens"]
+                if (
+                    model_info.get("max_output_tokens", None) is not None
+                    and model_info["max_output_tokens"] is not None
+                    and (
+                        model_group_info.max_output_tokens is None
+                        or model_info["max_output_tokens"]
+                        > model_group_info.max_output_tokens
                     )
+                ):
+                    model_group_info.max_output_tokens = model_info["max_output_tokens"]
+                if model_info.get("input_cost_per_token", None) is not None and (
+                    model_group_info.input_cost_per_token is None
+                    or model_info["input_cost_per_token"]
+                    > model_group_info.input_cost_per_token
+                ):
+                    model_group_info.input_cost_per_token = model_info[
+                        "input_cost_per_token"
+                    ]
+                if model_info.get("output_cost_per_token", None) is not None and (
+                    model_group_info.output_cost_per_token is None
+                    or model_info["output_cost_per_token"]
+                    > model_group_info.output_cost_per_token
+                ):
+                    model_group_info.output_cost_per_token = model_info[
+                        "output_cost_per_token"
+                    ]
+                if (
+                    model_info.get("supports_parallel_function_calling", None)
+                    is not None
+                    and model_info["supports_parallel_function_calling"] is True  # type: ignore
+                ):
+                    model_group_info.supports_parallel_function_calling = True
+                if (
+                    model_info.get("supports_vision", None) is not None
+                    and model_info["supports_vision"] is True  # type: ignore
+                ):
+                    model_group_info.supports_vision = True
+                if (
+                    model_info.get("supports_function_calling", None) is not None
+                    and model_info["supports_function_calling"] is True  # type: ignore
+                ):
+                    model_group_info.supports_function_calling = True
+                if (
+                    model_info.get("supported_openai_params", None) is not None
+                    and model_info["supported_openai_params"] is not None
+                ):
+                    model_group_info.supported_openai_params = model_info[
+                        "supported_openai_params"
+                    ]
 
-                if model_info is None:
-                    supported_openai_params = litellm.get_supported_openai_params(
-                        model=model, custom_llm_provider=llm_provider
-                    )
-                    if supported_openai_params is None:
-                        supported_openai_params = []
-                    model_info = ModelMapInfo(
-                        key=model_group,
-                        max_tokens=None,
-                        max_input_tokens=None,
-                        max_output_tokens=None,
-                        input_cost_per_token=0,
-                        output_cost_per_token=0,
-                        litellm_provider=llm_provider,
-                        mode="chat",
-                        supported_openai_params=supported_openai_params,
-                        supports_system_messages=None,
-                    )
+        if model_group_info is not None:
+            ## UPDATE WITH TOTAL TPM/RPM FOR MODEL GROUP
+            if total_tpm is not None:
+                model_group_info.tpm = total_tpm
 
-                if model_group_info is None:
-                    model_group_info = ModelGroupInfo(
-                        model_group=user_facing_model_group_name, providers=[llm_provider], **model_info  # type: ignore
-                    )
-                else:
-                    # if max_input_tokens > curr
-                    # if max_output_tokens > curr
-                    # if input_cost_per_token > curr
-                    # if output_cost_per_token > curr
-                    # supports_parallel_function_calling == True
-                    # supports_vision == True
-                    # supports_function_calling == True
-                    if llm_provider not in model_group_info.providers:
-                        model_group_info.providers.append(llm_provider)
-                    if (
-                        model_info.get("max_input_tokens", None) is not None
-                        and model_info["max_input_tokens"] is not None
-                        and (
-                            model_group_info.max_input_tokens is None
-                            or model_info["max_input_tokens"]
-                            > model_group_info.max_input_tokens
-                        )
-                    ):
-                        model_group_info.max_input_tokens = model_info[
-                            "max_input_tokens"
-                        ]
-                    if (
-                        model_info.get("max_output_tokens", None) is not None
-                        and model_info["max_output_tokens"] is not None
-                        and (
-                            model_group_info.max_output_tokens is None
-                            or model_info["max_output_tokens"]
-                            > model_group_info.max_output_tokens
-                        )
-                    ):
-                        model_group_info.max_output_tokens = model_info[
-                            "max_output_tokens"
-                        ]
-                    if model_info.get("input_cost_per_token", None) is not None and (
-                        model_group_info.input_cost_per_token is None
-                        or model_info["input_cost_per_token"]
-                        > model_group_info.input_cost_per_token
-                    ):
-                        model_group_info.input_cost_per_token = model_info[
-                            "input_cost_per_token"
-                        ]
-                    if model_info.get("output_cost_per_token", None) is not None and (
-                        model_group_info.output_cost_per_token is None
-                        or model_info["output_cost_per_token"]
-                        > model_group_info.output_cost_per_token
-                    ):
-                        model_group_info.output_cost_per_token = model_info[
-                            "output_cost_per_token"
-                        ]
-                    if (
-                        model_info.get("supports_parallel_function_calling", None)
-                        is not None
-                        and model_info["supports_parallel_function_calling"] is True  # type: ignore
-                    ):
-                        model_group_info.supports_parallel_function_calling = True
-                    if (
-                        model_info.get("supports_vision", None) is not None
-                        and model_info["supports_vision"] is True  # type: ignore
-                    ):
-                        model_group_info.supports_vision = True
-                    if (
-                        model_info.get("supports_function_calling", None) is not None
-                        and model_info["supports_function_calling"] is True  # type: ignore
-                    ):
-                        model_group_info.supports_function_calling = True
-                    if (
-                        model_info.get("supported_openai_params", None) is not None
-                        and model_info["supported_openai_params"] is not None
-                    ):
-                        model_group_info.supported_openai_params = model_info[
-                            "supported_openai_params"
-                        ]
+            if total_rpm is not None:
+                model_group_info.rpm = total_rpm
 
-        ## UPDATE WITH TOTAL TPM/RPM FOR MODEL GROUP
-        if total_tpm is not None and model_group_info is not None:
-            model_group_info.tpm = total_tpm
-
-        if total_rpm is not None and model_group_info is not None:
-            model_group_info.rpm = total_rpm
+            ## UPDATE WITH CONFIGURABLE CLIENTSIDE AUTH PARAMS FOR MODEL GROUP
+            if configurable_clientside_auth_params is not None:
+                model_group_info.configurable_clientside_auth_params = (
+                    configurable_clientside_auth_params
+                )
 
         return model_group_info
 
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index ea2a95d8a..a51dcc693 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -141,9 +141,16 @@ def test_completion_azure_ai_command_r():
         os.environ["AZURE_AI_API_BASE"] = os.getenv("AZURE_COHERE_API_BASE", "")
         os.environ["AZURE_AI_API_KEY"] = os.getenv("AZURE_COHERE_API_KEY", "")
 
-        response: litellm.ModelResponse = completion(
+        response = completion(
             model="azure_ai/command-r-plus",
-            messages=[{"role": "user", "content": "What is the meaning of life?"}],
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What is the meaning of life?"}
+                    ],
+                }
+            ],
         )  # type: ignore
 
         assert "azure_ai" in response.model
diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py
index d2ffaa4c9..259d6cccb 100644
--- a/litellm/tests/test_completion_cost.py
+++ b/litellm/tests/test_completion_cost.py
@@ -1257,14 +1257,31 @@ def test_completion_cost_databricks_embedding(model):
     cost = completion_cost(completion_response=resp)
 
 
-def test_completion_cost_fireworks_ai():
+from litellm.llms.fireworks_ai.cost_calculator import get_base_model_for_pricing
+
+
+@pytest.mark.parametrize(
+    "model, base_model",
+    [
+        ("fireworks_ai/llama-v3p1-405b-instruct", "fireworks-ai-default"),
+        ("fireworks_ai/mixtral-8x7b-instruct", "fireworks-ai-moe-up-to-56b"),
+    ],
+)
+def test_get_model_params_fireworks_ai(model, base_model):
+    pricing_model = get_base_model_for_pricing(model_name=model)
+    assert base_model == pricing_model
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["fireworks_ai/llama-v3p1-405b-instruct", "fireworks_ai/mixtral-8x7b-instruct"],
+)
+def test_completion_cost_fireworks_ai(model):
     os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
     litellm.model_cost = litellm.get_model_cost_map(url="")
 
     messages = [{"role": "user", "content": "Hey, how's it going?"}]
-    resp = litellm.completion(
-        model="fireworks_ai/mixtral-8x7b-instruct", messages=messages
-    )  # works fine
+    resp = litellm.completion(model=model, messages=messages)  # works fine
 
     print(resp)
     cost = completion_cost(completion_response=resp)
diff --git a/litellm/tests/test_proxy_utils.py b/litellm/tests/test_proxy_utils.py
index b5aac09d1..43c0de51f 100644
--- a/litellm/tests/test_proxy_utils.py
+++ b/litellm/tests/test_proxy_utils.py
@@ -12,6 +12,7 @@ sys.path.insert(
     0, os.path.abspath("../..")
 )  # Adds the parent directory to the system path
 from litellm.proxy._types import LitellmUserRoles, UserAPIKeyAuth
+from litellm.proxy.auth.auth_utils import is_request_body_safe
 from litellm.proxy.litellm_pre_call_utils import (
     _get_dynamic_logging_metadata,
     add_litellm_data_to_request,
@@ -291,3 +292,78 @@ def test_dynamic_logging_metadata_key_and_team_metadata(callback_vars):
 
     for var in callbacks.callback_vars.values():
         assert "os.environ" not in var
+
+
+@pytest.mark.parametrize(
+    "allow_client_side_credentials, expect_error", [(True, False), (False, True)]
+)
+def test_is_request_body_safe_global_enabled(
+    allow_client_side_credentials, expect_error
+):
+    from litellm import Router
+
+    error_raised = False
+
+    llm_router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                    "api_key": os.getenv("OPENAI_API_KEY"),
+                },
+            }
+        ]
+    )
+    try:
+        is_request_body_safe(
+            request_body={"api_base": "hello-world"},
+            general_settings={
+                "allow_client_side_credentials": allow_client_side_credentials
+            },
+            llm_router=llm_router,
+            model="gpt-3.5-turbo",
+        )
+    except Exception as e:
+        print(e)
+        error_raised = True
+
+    assert expect_error == error_raised
+
+
+@pytest.mark.parametrize(
+    "allow_client_side_credentials, expect_error", [(True, False), (False, True)]
+)
+def test_is_request_body_safe_model_enabled(
+    allow_client_side_credentials, expect_error
+):
+    from litellm import Router
+
+    error_raised = False
+
+    llm_router = Router(
+        model_list=[
+            {
+                "model_name": "fireworks_ai/*",
+                "litellm_params": {
+                    "model": "fireworks_ai/*",
+                    "api_key": os.getenv("FIREWORKS_API_KEY"),
+                    "configurable_clientside_auth_params": (
+                        ["api_base"] if allow_client_side_credentials else []
+                    ),
+                },
+            }
+        ]
+    )
+    try:
+        is_request_body_safe(
+            request_body={"api_base": "hello-world"},
+            general_settings={},
+            llm_router=llm_router,
+            model="fireworks_ai/my-new-model",
+        )
+    except Exception as e:
+        print(e)
+        error_raised = True
+
+    assert expect_error == error_raised
diff --git a/litellm/types/llms/vertex_ai.py b/litellm/types/llms/vertex_ai.py
index e502f44c3..0637933e2 100644
--- a/litellm/types/llms/vertex_ai.py
+++ b/litellm/types/llms/vertex_ai.py
@@ -283,7 +283,7 @@ class PromptFeedback(TypedDict):
 
 
 class GenerateContentResponseBody(TypedDict, total=False):
-    candidates: Required[List[Candidates]]
+    candidates: List[Candidates]
     promptFeedback: PromptFeedback
     usageMetadata: Required[UsageMetadata]
 
diff --git a/litellm/types/router.py b/litellm/types/router.py
index 304e6fd43..306dfcba1 100644
--- a/litellm/types/router.py
+++ b/litellm/types/router.py
@@ -139,6 +139,7 @@ class GenericLiteLLMParams(BaseModel):
     )
     max_retries: Optional[int] = None
     organization: Optional[str] = None  # for openai orgs
+    configurable_clientside_auth_params: Optional[List[str]] = None
     ## UNIFIED PROJECT/REGION ##
     region_name: Optional[str] = None
     ## VERTEX AI ##
@@ -310,6 +311,9 @@ class LiteLLMParamsTypedDict(TypedDict, total=False):
     stream_timeout: Optional[Union[float, str]]
     max_retries: Optional[int]
     organization: Optional[Union[List, str]]  # for openai orgs
+    configurable_clientside_auth_params: Optional[
+        List[str]
+    ]  # for allowing api base switching on finetuned models
     ## DROP PARAMS ##
     drop_params: Optional[bool]
     ## UNIFIED PROJECT/REGION ##
@@ -487,6 +491,7 @@ class ModelGroupInfo(BaseModel):
     supports_vision: bool = Field(default=False)
     supports_function_calling: bool = Field(default=False)
     supported_openai_params: Optional[List[str]] = Field(default=[])
+    configurable_clientside_auth_params: Optional[List[str]] = None
 
 
 class AssistantsTypedDict(TypedDict):
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index 9f8c8730b..618f9bf47 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -1196,6 +1196,7 @@ all_litellm_params = [
     "client_id",
     "client_secret",
     "user_continue_message",
+    "configurable_clientside_auth_params",
 ]
 
 
@@ -1323,7 +1324,7 @@ class StandardLoggingPayload(TypedDict):
     metadata: StandardLoggingMetadata
     cache_hit: Optional[bool]
     cache_key: Optional[str]
-    saved_cache_cost: Optional[float]
+    saved_cache_cost: float
     request_tags: list
     end_user: Optional[str]
     requester_ip_address: Optional[str]
diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json
index 9d5c35f3b..5aef89d45 100644
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@@ -3862,9 +3862,9 @@
         "supports_vision": true
     },
     "anthropic.claude-3-5-sonnet-20240620-v1:0": {
-        "max_tokens": 8192, 
+        "max_tokens": 4096, 
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -3906,9 +3906,9 @@
         "supports_vision": true
     },
     "us.anthropic.claude-3-5-sonnet-20240620-v1:0": {
-        "max_tokens": 8192,
+        "max_tokens": 4096,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -3939,9 +3939,9 @@
         "supports_vision": true
     },
     "eu.anthropic.claude-3-sonnet-20240229-v1:0": {
-        "max_tokens": 8192,
+        "max_tokens": 4096,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -3950,9 +3950,9 @@
         "supports_vision": true
     },
     "eu.anthropic.claude-3-5-sonnet-20240620-v1:0": {
-        "max_tokens": 8192,
+        "max_tokens": 4096,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -5593,6 +5593,11 @@
         "output_cost_per_token": 0.0000012,
         "litellm_provider": "fireworks_ai"
     },
+    "fireworks-ai-default": {
+        "input_cost_per_token": 0.0,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "fireworks_ai"
+    },
     "fireworks-ai-embedding-up-to-150m": {
         "input_cost_per_token": 0.000000008,
         "output_cost_per_token": 0.000000,