LiteLLM Minor Fixes & Improvements (09/23/2024) (#5842) (#5858)

* LiteLLM Minor Fixes & Improvements (09/23/2024)  (#5842)

* feat(auth_utils.py): enable admin to allow client-side credentials to be passed

Makes it easier for devs to experiment with fine-tuned Fireworks AI models.
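
A minimal sketch of how the new request-body check behaves (mirroring the unit test added in this commit); the endpoint URL is a hypothetical placeholder:

```python
from litellm.proxy.auth.auth_utils import is_request_body_safe

# With the global `allow_client_side_credentials` setting enabled, a
# client-supplied `api_base` passes the safety check instead of being rejected.
is_request_body_safe(
    request_body={"api_base": "https://my-finetuned-endpoint.example.com"},  # hypothetical
    general_settings={"allow_client_side_credentials": True},
    llm_router=None,
    model="gpt-3.5-turbo",
)  # returns True; with the flag off (and no model-level allowance) this raises ValueError
```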

* feat(router.py): allow setting configurable_clientside_auth_params for a model

Closes https://github.com/BerriAI/litellm/issues/5843
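
A minimal sketch of the model-level setting, based on the shape used by the proxy-config example and the test added in this commit:

```python
import os

from litellm import Router

# The wildcard fireworks_ai deployment lets callers override `api_base`
# per-request, without opening up client-side credentials globally.
llm_router = Router(
    model_list=[
        {
            "model_name": "fireworks_ai/*",
            "litellm_params": {
                "model": "fireworks_ai/*",
                "api_key": os.getenv("FIREWORKS_API_KEY"),
                "configurable_clientside_auth_params": ["api_base"],
            },
        }
    ]
)
```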

* build(model_prices_and_context_window.json): fix anthropic claude-3-5-sonnet max output token limit

Fixes https://github.com/BerriAI/litellm/issues/5850

* fix(azure_ai/): support content list for azure ai

Fixes https://github.com/BerriAI/litellm/issues/4237
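
A short sketch of the content-list flattening this relies on (see `convert_content_list_to_str` in the diff below); a text-only content list is collapsed into a plain string before the request is sent:

```python
from litellm.llms.prompt_templates.common_utils import convert_content_list_to_str

message = {
    "role": "user",
    "content": [{"type": "text", "text": "What is the meaning of life?"}],
}

# The message comes back with "content" flattened to the string
# "What is the meaning of life?".
flattened = convert_content_list_to_str(message=message)
```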

* fix(litellm_logging.py): always set saved_cache_cost

Set to 0 by default

* fix(fireworks_ai/cost_calculator.py): add fireworks ai default pricing

Handles calling 405B+ models that don't have an explicit pricing entry.
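
A quick sketch of the fallback (expected values come from the parametrized test added in this commit): unmapped Fireworks AI models now resolve to the zero-cost "fireworks-ai-default" pricing entry instead of failing cost calculation:

```python
from litellm.llms.fireworks_ai.cost_calculator import get_base_model_for_pricing

print(get_base_model_for_pricing(model_name="fireworks_ai/llama-v3p1-405b-instruct"))
# -> "fireworks-ai-default"  (no size bucket matches 405B)
print(get_base_model_for_pricing(model_name="fireworks_ai/mixtral-8x7b-instruct"))
# -> "fireworks-ai-moe-up-to-56b"
```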

* fix(slack_alerting.py): fix error alerting for failed spend tracking

Fixes a regression in Slack alerting's error monitoring.
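
Related cleanup in the same change: the default alert types are now derived from the `AlertType` Literal itself rather than a hand-maintained list (which had drifted and was missing "failed_tracking_spend"). A trimmed-down sketch of the pattern:

```python
from typing import Literal, get_args

# Trimmed for illustration; the real AlertType Literal is defined elsewhere in LiteLLM.
AlertType = Literal["llm_exceptions", "budget_alerts", "failed_tracking_spend"]

default_alert_types = list(get_args(AlertType))  # every member of the Literal
```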

* fix(vertex_and_google_ai_studio_gemini.py): handle gemini no candidates in streaming chunk error
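
The fix follows the defensive pattern sketched below (a hypothetical chunk is used for illustration): a streaming chunk may arrive without a "candidates" key, so it is looked up with `.get()` and only indexed when non-empty:

```python
# Hypothetical Gemini streaming chunk with no "candidates" present.
processed_chunk = {"usageMetadata": {"totalTokenCount": 10}}

_candidates = processed_chunk.get("candidates")
gemini_chunk = None
if _candidates and len(_candidates) > 0:
    gemini_chunk = _candidates[0]

if gemini_chunk and "content" in gemini_chunk:
    ...  # extract text / tool calls as before; otherwise an empty chunk is returned
```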

* docs(bedrock.md): add llama3-1 models
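
Usage sketch for the newly documented Bedrock Llama 3.1 models, following the pattern in the table added to bedrock.md (the region/credential env-var setup shown is the usual Bedrock configuration, not part of the doc change):

```python
import os

from litellm import completion

os.environ["AWS_ACCESS_KEY_ID"] = ""       # your AWS credentials
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""         # e.g. us-west-2

response = completion(
    model="bedrock/meta.llama3-1-405b-instruct-v1:0",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```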

* test: fix tests

* fix(azure_ai/chat): fix transformation for azure ai calls
Krish Dholakia 2024-09-24 15:01:31 -07:00 committed by GitHub
parent 4df9aca45e
commit d37c8b5c6b
25 changed files with 611 additions and 294 deletions


@@ -987,6 +987,9 @@ Here's an example of using a bedrock model with LiteLLM. For a complete list, re
 | Anthropic Claude-V2.1 | `completion(model='bedrock/anthropic.claude-v2:1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
 | Anthropic Claude-V2 | `completion(model='bedrock/anthropic.claude-v2', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
 | Anthropic Claude-Instant V1 | `completion(model='bedrock/anthropic.claude-instant-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
+| Meta llama3-1-405b | `completion(model='bedrock/meta.llama3-1-405b-instruct-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
+| Meta llama3-1-70b | `completion(model='bedrock/meta.llama3-1-70b-instruct-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
+| Meta llama3-1-8b | `completion(model='bedrock/meta.llama3-1-8b-instruct-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
 | Meta llama3-70b | `completion(model='bedrock/meta.llama3-70b-instruct-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
 | Meta llama3-8b | `completion(model='bedrock/meta.llama3-8b-instruct-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
 | Amazon Titan Lite | `completion(model='bedrock/amazon.titan-text-lite-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` |


@@ -963,8 +963,8 @@ from .llms.OpenAI.openai import (
     MistralEmbeddingConfig,
     DeepInfraConfig,
     GroqConfig,
-    AzureAIStudioConfig,
 )
+from .llms.azure_ai.chat.transformation import AzureAIStudioConfig
 from .llms.mistral.mistral_chat_transformation import MistralConfig
 from .llms.OpenAI.chat.o1_transformation import (
     OpenAIO1Config,


@@ -10,7 +10,7 @@ import traceback
 from datetime import datetime as dt
 from datetime import timedelta, timezone
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Set, TypedDict, Union
+from typing import Any, Dict, List, Literal, Optional, Set, TypedDict, Union, get_args
 
 import aiohttp
 import dotenv
@@ -57,20 +57,7 @@ class SlackAlerting(CustomBatchLogger):
             float
         ] = None,  # threshold for slow / hanging llm responses (in seconds)
         alerting: Optional[List] = [],
-        alert_types: List[AlertType] = [
-            "llm_exceptions",
-            "llm_too_slow",
-            "llm_requests_hanging",
-            "budget_alerts",
-            "db_exceptions",
-            "daily_reports",
-            "spend_reports",
-            "fallback_reports",
-            "cooldown_deployment",
-            "new_model_added",
-            "outage_alerts",
-            "failed_tracking_spend",
-        ],
+        alert_types: List[AlertType] = list(get_args(AlertType)),
         alert_to_webhook_url: Optional[
             Dict[AlertType, Union[List[str], str]]
         ] = None,  # if user wants to separate alerts to diff channels
@@ -613,7 +600,7 @@ class SlackAlerting(CustomBatchLogger):
            await self.send_alert(
                message=message,
                level="High",
-               alert_type="budget_alerts",
+               alert_type="failed_tracking_spend",
                alerting_metadata={},
            )
            await _cache.async_set_cache(


@@ -2498,14 +2498,17 @@ def get_standard_logging_object_payload(
     else:
         cache_key = None
-    saved_cache_cost: Optional[float] = None
+    saved_cache_cost: float = 0.0
     if cache_hit is True:
         id = f"{id}_cache_hit{time.time()}"  # do not duplicate the request id
-        saved_cache_cost = logging_obj._response_cost_calculator(
-            result=init_response_obj, cache_hit=False  # type: ignore
-        )
+        saved_cache_cost = (
+            logging_obj._response_cost_calculator(
+                result=init_response_obj, cache_hit=False  # type: ignore
+            )
+            or 0.0
+        )
 
     ## Get model cost information ##
     base_model = _get_base_model_from_metadata(model_call_details=kwargs)


@@ -103,25 +103,6 @@ class MistralEmbeddingConfig:
         return optional_params
 
 
-class AzureAIStudioConfig:
-    def get_required_params(self) -> List[ProviderField]:
-        """For a given provider, return it's required fields with a description"""
-        return [
-            ProviderField(
-                field_name="api_key",
-                field_type="string",
-                field_description="Your Azure AI Studio API Key.",
-                field_value="zEJ...",
-            ),
-            ProviderField(
-                field_name="api_base",
-                field_type="string",
-                field_description="Your Azure AI Studio API Base.",
-                field_value="https://Mistral-serverless.",
-            ),
-        ]
-
-
 class DeepInfraConfig:
     """
     Reference: https://deepinfra.com/docs/advanced/openai_api


@@ -0,0 +1,59 @@
+from typing import Any, Callable, List, Optional, Union
+
+from httpx._config import Timeout
+
+from litellm.llms.bedrock.chat.invoke_handler import MockResponseIterator
+from litellm.llms.OpenAI.openai import OpenAIChatCompletion
+from litellm.types.utils import ModelResponse
+from litellm.utils import CustomStreamWrapper
+
+from .transformation import AzureAIStudioConfig
+
+
+class AzureAIChatCompletion(OpenAIChatCompletion):
+    def completion(
+        self,
+        model_response: ModelResponse,
+        timeout: Union[float, Timeout],
+        optional_params: dict,
+        logging_obj: Any,
+        model: Optional[str] = None,
+        messages: Optional[list] = None,
+        print_verbose: Optional[Callable[..., Any]] = None,
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+        acompletion: bool = False,
+        litellm_params=None,
+        logger_fn=None,
+        headers: Optional[dict] = None,
+        custom_prompt_dict: dict = {},
+        client=None,
+        organization: Optional[str] = None,
+        custom_llm_provider: Optional[str] = None,
+        drop_params: Optional[bool] = None,
+    ):
+
+        transformed_messages = AzureAIStudioConfig()._transform_messages(
+            messages=messages  # type: ignore
+        )
+
+        return super().completion(
+            model_response,
+            timeout,
+            optional_params,
+            logging_obj,
+            model,
+            transformed_messages,
+            print_verbose,
+            api_key,
+            api_base,
+            acompletion,
+            litellm_params,
+            logger_fn,
+            headers,
+            custom_prompt_dict,
+            client,
+            organization,
+            custom_llm_provider,
+            drop_params,
+        )


@@ -0,0 +1,31 @@
+from typing import List
+
+from litellm.llms.OpenAI.openai import OpenAIConfig
+from litellm.llms.prompt_templates.common_utils import convert_content_list_to_str
+from litellm.types.llms.openai import AllMessageValues
+from litellm.types.utils import ProviderField
+
+
+class AzureAIStudioConfig(OpenAIConfig):
+    def get_required_params(self) -> List[ProviderField]:
+        """For a given provider, return it's required fields with a description"""
+        return [
+            ProviderField(
+                field_name="api_key",
+                field_type="string",
+                field_description="Your Azure AI Studio API Key.",
+                field_value="zEJ...",
+            ),
+            ProviderField(
+                field_name="api_base",
+                field_type="string",
+                field_description="Your Azure AI Studio API Base.",
+                field_value="https://Mistral-serverless.",
+            ),
+        ]
+
+    def _transform_messages(self, messages: List[AllMessageValues]) -> List:
+        for message in messages:
+            message = convert_content_list_to_str(message=message)
+
+        return messages


@@ -10,7 +10,7 @@ from litellm.utils import get_model_info
 
 # Extract the number of billion parameters from the model name
 # only used for together_computer LLMs
-def get_model_params_and_category(model_name: str) -> str:
+def get_base_model_for_pricing(model_name: str) -> str:
     """
     Helper function for calculating together ai pricing.
 
@@ -43,7 +43,7 @@ def get_model_params_and_category(model_name: str) -> str:
         return "fireworks-ai-16b-80b"
 
     # If no matches, return the original model_name
-    return model_name
+    return "fireworks-ai-default"
 
 
 def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:
@@ -57,10 +57,16 @@ def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:
     Returns:
         Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
     """
-    base_model = get_model_params_and_category(model_name=model)
+    ## check if model mapped, else use default pricing
+    try:
+        model_info = get_model_info(model=model, custom_llm_provider="fireworks_ai")
+    except Exception:
+        base_model = get_base_model_for_pricing(model_name=model)
 
-    ## GET MODEL INFO
-    model_info = get_model_info(model=base_model, custom_llm_provider="fireworks_ai")
+        ## GET MODEL INFO
+        model_info = get_model_info(
+            model=base_model, custom_llm_provider="fireworks_ai"
+        )
 
     ## CALCULATE INPUT COST


@@ -0,0 +1,32 @@
+"""
+Common utility functions used for translating messages across providers
+"""
+
+from typing import List
+
+from litellm.types.llms.openai import AllMessageValues
+
+
+def convert_content_list_to_str(message: AllMessageValues) -> AllMessageValues:
+    """
+    - handles scenario where content is list and not string
+    - content list is just text, and no images
+    - if image passed in, then just return as is (user-intended)
+
+    Motivation: mistral api + azure ai don't support content as a list
+    """
+    texts = ""
+
+    message_content = message.get("content")
+    if message_content:
+        if message_content is not None and isinstance(message_content, list):
+            for c in message_content:
+                text_content = c.get("text")
+                if text_content:
+                    texts += text_content
+        elif message_content is not None and isinstance(message_content, str):
+            texts = message_content
+
+    if texts:
+        message["content"] = texts
+
+    return message


@@ -49,6 +49,7 @@ from litellm.types.llms.openai import (
     ChatCompletionUsageBlock,
 )
 from litellm.types.llms.vertex_ai import (
+    Candidates,
     ContentType,
     FunctionCallingConfig,
     FunctionDeclaration,
@@ -187,7 +188,11 @@ class VertexAIConfig:
                 optional_params["stop_sequences"] = value
             if param == "max_tokens" or param == "max_completion_tokens":
                 optional_params["max_output_tokens"] = value
-            if param == "response_format" and value["type"] == "json_object":
+            if (
+                param == "response_format"
+                and isinstance(value, dict)
+                and value["type"] == "json_object"
+            ):
                 optional_params["response_mime_type"] = "application/json"
             if param == "frequency_penalty":
                 optional_params["frequency_penalty"] = value
@@ -900,14 +905,14 @@ class VertexLLM(VertexBase):
 
             return model_response
 
-        if len(completion_response["candidates"]) > 0:
+        _candidates = completion_response.get("candidates")
+        if _candidates and len(_candidates) > 0:
             content_policy_violations = (
                 VertexGeminiConfig().get_flagged_finish_reasons()
             )
             if (
-                "finishReason" in completion_response["candidates"][0]
-                and completion_response["candidates"][0]["finishReason"]
-                in content_policy_violations.keys()
+                "finishReason" in _candidates[0]
+                and _candidates[0]["finishReason"] in content_policy_violations.keys()
             ):
                 ## CONTENT POLICY VIOLATION ERROR
                 model_response.choices[0].finish_reason = "content_filter"
@@ -956,12 +961,13 @@
         content_str = ""
         tools: List[ChatCompletionToolCallChunk] = []
         functions: Optional[ChatCompletionToolCallFunctionChunk] = None
-        for idx, candidate in enumerate(completion_response["candidates"]):
-            if "content" not in candidate:
-                continue
+        if _candidates:
+            for idx, candidate in enumerate(_candidates):
+                if "content" not in candidate:
+                    continue
 
-            if "groundingMetadata" in candidate:
-                grounding_metadata.append(candidate["groundingMetadata"])
+                if "groundingMetadata" in candidate:
+                    grounding_metadata.append(candidate["groundingMetadata"])  # type: ignore
 
-            if "safetyRatings" in candidate:
-                safety_ratings.append(candidate["safetyRatings"])
+                if "safetyRatings" in candidate:
+                    safety_ratings.append(candidate["safetyRatings"])
@@ -973,7 +979,9 @@
                 if "functionCall" in candidate["content"]["parts"][0]:
                     _function_chunk = ChatCompletionToolCallFunctionChunk(
-                        name=candidate["content"]["parts"][0]["functionCall"]["name"],
+                        name=candidate["content"]["parts"][0]["functionCall"][
+                            "name"
+                        ],
                         arguments=json.dumps(
                             candidate["content"]["parts"][0]["functionCall"]["args"]
                         ),
@@ -1433,10 +1441,12 @@ class ModelResponseIterator:
             is_finished = False
             finish_reason = ""
             usage: Optional[ChatCompletionUsageBlock] = None
+            _candidates: Optional[List[Candidates]] = processed_chunk.get("candidates")
+            gemini_chunk: Optional[Candidates] = None
+            if _candidates and len(_candidates) > 0:
+                gemini_chunk = _candidates[0]
 
-            gemini_chunk = processed_chunk["candidates"][0]
-
-            if "content" in gemini_chunk:
+            if gemini_chunk and "content" in gemini_chunk:
                 if "text" in gemini_chunk["content"]["parts"][0]:
                     text = gemini_chunk["content"]["parts"][0]["text"]
                 elif "functionCall" in gemini_chunk["content"]["parts"][0]:
@@ -1455,7 +1465,7 @@
                         index=0,
                     )
 
-            if "finishReason" in gemini_chunk:
+            if gemini_chunk and "finishReason" in gemini_chunk:
                 finish_reason = map_finish_reason(
                     finish_reason=gemini_chunk["finishReason"]
                 )
@@ -1533,6 +1543,7 @@
             )
 
     def _common_chunk_parsing_logic(self, chunk: str) -> GenericStreamingChunk:
-        chunk = chunk.replace("data:", "")
-        if len(chunk) > 0:
-            """
+        try:
+            chunk = chunk.replace("data:", "")
+            if len(chunk) > 0:
+                """
@@ -1544,7 +1555,7 @@
                     return self.handle_valid_json_chunk(chunk=chunk)
                 elif self.chunk_type == "accumulated_json":
                     return self.handle_accumulated_json_chunk(chunk=chunk)
-        else:
+
             return GenericStreamingChunk(
                 text="",
                 is_finished=False,
@@ -1553,6 +1564,8 @@
                 index=0,
                 tool_use=None,
             )
+        except Exception:
+            raise
 
     def __next__(self):
         try:


@@ -83,6 +83,7 @@ from .llms import (
 from .llms.AI21 import completion as ai21
 from .llms.anthropic.chat import AnthropicChatCompletion
 from .llms.anthropic.completion import AnthropicTextCompletion
+from .llms.azure_ai.chat.handler import AzureAIChatCompletion
 from .llms.azure_text import AzureTextCompletion
 from .llms.AzureOpenAI.audio_transcriptions import AzureAudioTranscription
 from .llms.AzureOpenAI.azure import AzureChatCompletion, _check_dynamic_azure_params
@@ -166,6 +167,7 @@ openai_text_completions = OpenAITextCompletion()
 openai_o1_chat_completions = OpenAIO1ChatCompletion()
 openai_audio_transcriptions = OpenAIAudioTranscription()
 databricks_chat_completions = DatabricksChatCompletion()
+azure_ai_chat_completions = AzureAIChatCompletion()
 anthropic_chat_completions = AnthropicChatCompletion()
 anthropic_text_completions = AnthropicTextCompletion()
 azure_chat_completions = AzureChatCompletion()
@@ -1177,7 +1179,7 @@ def completion(
         headers = headers or litellm.headers
 
         ## LOAD CONFIG - if set
-        config = litellm.OpenAIConfig.get_config()
+        config = litellm.AzureAIStudioConfig.get_config()
         for k, v in config.items():
             if (
                 k not in optional_params
@@ -1190,7 +1192,7 @@ def completion(
         ## COMPLETION CALL
         try:
-            response = openai_chat_completions.completion(
+            response = azure_ai_chat_completions.completion(
                 model=model,
                 messages=messages,
                 headers=headers,


@@ -3862,9 +3862,9 @@
         "supports_vision": true
     },
     "anthropic.claude-3-5-sonnet-20240620-v1:0": {
-        "max_tokens": 8192,
+        "max_tokens": 4096,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -3906,9 +3906,9 @@
         "supports_vision": true
     },
     "us.anthropic.claude-3-5-sonnet-20240620-v1:0": {
-        "max_tokens": 8192,
+        "max_tokens": 4096,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -3939,9 +3939,9 @@
         "supports_vision": true
     },
     "eu.anthropic.claude-3-sonnet-20240229-v1:0": {
-        "max_tokens": 8192,
+        "max_tokens": 4096,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -3950,9 +3950,9 @@
         "supports_vision": true
     },
     "eu.anthropic.claude-3-5-sonnet-20240620-v1:0": {
-        "max_tokens": 8192,
+        "max_tokens": 4096,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -5593,6 +5593,11 @@
         "output_cost_per_token": 0.0000012,
         "litellm_provider": "fireworks_ai"
     },
+    "fireworks-ai-default": {
+        "input_cost_per_token": 0.0,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "fireworks_ai"
+    },
     "fireworks-ai-embedding-up-to-150m": {
         "input_cost_per_token": 0.000000008,
         "output_cost_per_token": 0.000000,


@@ -31,6 +31,13 @@ model_list:
   - model_name: "anthropic/*"
     litellm_params:
       model: "anthropic/*"
+  - model_name: "openai/*"
+    litellm_params:
+      model: "openai/*"
+  - model_name: "fireworks_ai/*"
+    litellm_params:
+      model: "fireworks_ai/*"
+      configurable_clientside_auth_params: ["api_base"]
 
 litellm_settings:


@@ -5,6 +5,7 @@ from typing import List, Optional, Tuple
 
 from fastapi import HTTPException, Request, status
 
+from litellm import Router, provider_list
 from litellm._logging import verbose_proxy_logger
 from litellm.proxy._types import *
 
@@ -72,7 +73,41 @@ def check_complete_credentials(request_body: dict) -> bool:
     return False
 
 
-def is_request_body_safe(request_body: dict) -> bool:
+def _allow_model_level_clientside_configurable_parameters(
+    model: str, param: str, llm_router: Optional[Router]
+) -> bool:
+    """
+    Check if model is allowed to use configurable client-side params
+    - get matching model
+    - check if 'clientside_configurable_parameters' is set for model
+    -
+    """
+    if llm_router is None:
+        return False
+    # check if model is set
+    model_info = llm_router.get_model_group_info(model_group=model)
+    if model_info is None:
+        # check if wildcard model is set
+        if model.split("/", 1)[0] in provider_list:
+            model_info = llm_router.get_model_group_info(
+                model_group=model.split("/", 1)[0]
+            )
+
+    if model_info is None:
+        return False
+
+    if model_info is None or model_info.configurable_clientside_auth_params is None:
+        return False
+
+    if param in model_info.configurable_clientside_auth_params:
+        return True
+
+    return False
+
+
+def is_request_body_safe(
+    request_body: dict, general_settings: dict, llm_router: Optional[Router], model: str
+) -> bool:
     """
     Check if the request body is safe.
 
@@ -88,7 +123,20 @@
                 request_body=request_body
             )
         ):
-            raise ValueError(f"BadRequest: {param} is not allowed in request body")
+            if general_settings.get("allow_client_side_credentials") is True:
+                return True
+            elif (
+                _allow_model_level_clientside_configurable_parameters(
+                    model=model, param=param, llm_router=llm_router
+                )
+                is True
+            ):
+                return True
+            raise ValueError(
+                f"Rejected Request: {param} is not allowed in request body. "
+                "Enable with `general_settings::allow_client_side_credentials` on proxy config.yaml. "
+                "Relevant Issue: https://huntr.com/bounties/4001e1a2-7b7a-4776-a3ae-e6692ec3d997",
+            )
 
     return True
 
@@ -110,13 +158,20 @@
     Raises:
     - HTTPException if request fails initial auth checks
     """
-    from litellm.proxy.proxy_server import general_settings, premium_user
+    from litellm.proxy.proxy_server import general_settings, llm_router, premium_user
 
     # Check 1. request size
     await check_if_request_size_is_safe(request=request)
 
     # Check 2. Request body is safe
-    is_request_body_safe(request_body=request_data)
+    is_request_body_safe(
+        request_body=request_data,
+        general_settings=general_settings,
+        llm_router=llm_router,
+        model=request_data.get(
+            "model", ""
+        ),  # [TODO] use model passed in url as well (azure openai routes)
+    )
 
     # Check 3. Check if IP address is allowed
     is_valid_ip, passed_in_ip = _check_valid_ip(


@@ -66,7 +66,7 @@ async def route_request(
     """
     router_model_names = llm_router.model_names if llm_router is not None else []
 
-    if "api_key" in data:
+    if "api_key" in data or "api_base" in data:
         return getattr(litellm, f"{route_type}")(**data)
 
     elif "user_config" in data:


@@ -4,6 +4,8 @@ import secrets
 import traceback
 from typing import Optional
 
+from pydantic import BaseModel
+
 import litellm
 from litellm._logging import verbose_proxy_logger
 from litellm.proxy._types import SpendLogsMetadata, SpendLogsPayload
@@ -105,6 +107,8 @@ def get_logging_payload(
     additional_usage_values = {}
     for k, v in usage.items():
         if k not in special_usage_fields:
+            if isinstance(v, BaseModel):
+                v = v.model_dump()
             additional_usage_values.update({k: v})
     clean_metadata["additional_usage_values"] = additional_usage_values


@@ -14,7 +14,17 @@ from datetime import datetime, timedelta
 from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
 from functools import wraps
-from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple, Union, overload
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    get_args,
+    overload,
+)
 
 import backoff
 import httpx
@@ -222,19 +232,7 @@ class ProxyLogging:
         self.cache_control_check = _PROXY_CacheControlCheck()
         self.alerting: Optional[List] = None
         self.alerting_threshold: float = 300  # default to 5 min. threshold
-        self.alert_types: List[AlertType] = [
-            "llm_exceptions",
-            "llm_too_slow",
-            "llm_requests_hanging",
-            "budget_alerts",
-            "db_exceptions",
-            "daily_reports",
-            "spend_reports",
-            "fallback_reports",
-            "cooldown_deployment",
-            "new_model_added",
-            "outage_alerts",
-        ]
+        self.alert_types: List[AlertType] = list(get_args(AlertType))
         self.alert_to_webhook_url: Optional[dict] = None
         self.slack_alerting_instance: SlackAlerting = SlackAlerting(
             alerting_threshold=self.alerting_threshold,


@@ -4335,11 +4335,28 @@ class Router:
         total_tpm: Optional[int] = None
         total_rpm: Optional[int] = None
+        configurable_clientside_auth_params: Optional[List[str]] = None
 
         for model in self.model_list:
-            if "model_name" in model and model["model_name"] == model_group:
+            is_match = False
+            if (
+                "model_name" in model and model["model_name"] == model_group
+            ):  # exact match
+                is_match = True
+            elif (
+                "model_name" in model
+                and model_group in self.provider_default_deployments
+            ):  # wildcard model
+                is_match = True
+
+            if not is_match:
+                continue
             # model in model group found #
             litellm_params = LiteLLM_Params(**model["litellm_params"])
+            # get configurable clientside auth params
+            configurable_clientside_auth_params = (
+                litellm_params.configurable_clientside_auth_params
+            )
             # get model tpm
             _deployment_tpm: Optional[int] = None
             if _deployment_tpm is None:
@@ -4425,9 +4442,7 @@
                         > model_group_info.max_input_tokens
                     )
                 ):
-                    model_group_info.max_input_tokens = model_info[
-                        "max_input_tokens"
-                    ]
+                    model_group_info.max_input_tokens = model_info["max_input_tokens"]
                 if (
                     model_info.get("max_output_tokens", None) is not None
                     and model_info["max_output_tokens"] is not None
@@ -4437,9 +4452,7 @@
                         > model_group_info.max_output_tokens
                     )
                 ):
-                    model_group_info.max_output_tokens = model_info[
-                        "max_output_tokens"
-                    ]
+                    model_group_info.max_output_tokens = model_info["max_output_tokens"]
                 if model_info.get("input_cost_per_token", None) is not None and (
                     model_group_info.input_cost_per_token is None
                     or model_info["input_cost_per_token"]
@@ -4480,13 +4493,20 @@
                         "supported_openai_params"
                     ]
 
-        ## UPDATE WITH TOTAL TPM/RPM FOR MODEL GROUP
-        if total_tpm is not None and model_group_info is not None:
-            model_group_info.tpm = total_tpm
-
-        if total_rpm is not None and model_group_info is not None:
-            model_group_info.rpm = total_rpm
+        if model_group_info is not None:
+            ## UPDATE WITH TOTAL TPM/RPM FOR MODEL GROUP
+            if total_tpm is not None:
+                model_group_info.tpm = total_tpm
+
+            if total_rpm is not None:
+                model_group_info.rpm = total_rpm
+
+            ## UPDATE WITH CONFIGURABLE CLIENTSIDE AUTH PARAMS FOR MODEL GROUP
+            if configurable_clientside_auth_params is not None:
+                model_group_info.configurable_clientside_auth_params = (
+                    configurable_clientside_auth_params
+                )
 
         return model_group_info
 
     def get_model_group_info(self, model_group: str) -> Optional[ModelGroupInfo]:


@@ -141,9 +141,16 @@ def test_completion_azure_ai_command_r():
         os.environ["AZURE_AI_API_BASE"] = os.getenv("AZURE_COHERE_API_BASE", "")
         os.environ["AZURE_AI_API_KEY"] = os.getenv("AZURE_COHERE_API_KEY", "")
 
-        response: litellm.ModelResponse = completion(
+        response = completion(
             model="azure_ai/command-r-plus",
-            messages=[{"role": "user", "content": "What is the meaning of life?"}],
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What is the meaning of life?"}
+                    ],
+                }
+            ],
         )  # type: ignore
 
         assert "azure_ai" in response.model


@@ -1257,14 +1257,31 @@ def test_completion_cost_databricks_embedding(model):
     cost = completion_cost(completion_response=resp)
 
 
-def test_completion_cost_fireworks_ai():
+from litellm.llms.fireworks_ai.cost_calculator import get_base_model_for_pricing
+
+
+@pytest.mark.parametrize(
+    "model, base_model",
+    [
+        ("fireworks_ai/llama-v3p1-405b-instruct", "fireworks-ai-default"),
+        ("fireworks_ai/mixtral-8x7b-instruct", "fireworks-ai-moe-up-to-56b"),
+    ],
+)
+def test_get_model_params_fireworks_ai(model, base_model):
+    pricing_model = get_base_model_for_pricing(model_name=model)
+    assert base_model == pricing_model
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["fireworks_ai/llama-v3p1-405b-instruct", "fireworks_ai/mixtral-8x7b-instruct"],
+)
+def test_completion_cost_fireworks_ai(model):
     os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
     litellm.model_cost = litellm.get_model_cost_map(url="")
 
     messages = [{"role": "user", "content": "Hey, how's it going?"}]
-    resp = litellm.completion(
-        model="fireworks_ai/mixtral-8x7b-instruct", messages=messages
-    )  # works fine
+    resp = litellm.completion(model=model, messages=messages)  # works fine
 
     print(resp)
     cost = completion_cost(completion_response=resp)


@@ -12,6 +12,7 @@ sys.path.insert(
     0, os.path.abspath("../..")
 )  # Adds the parent directory to the system path
 from litellm.proxy._types import LitellmUserRoles, UserAPIKeyAuth
+from litellm.proxy.auth.auth_utils import is_request_body_safe
 from litellm.proxy.litellm_pre_call_utils import (
     _get_dynamic_logging_metadata,
     add_litellm_data_to_request,
@@ -291,3 +292,78 @@ def test_dynamic_logging_metadata_key_and_team_metadata(callback_vars):
     for var in callbacks.callback_vars.values():
         assert "os.environ" not in var
+
+
+@pytest.mark.parametrize(
+    "allow_client_side_credentials, expect_error", [(True, False), (False, True)]
+)
+def test_is_request_body_safe_global_enabled(
+    allow_client_side_credentials, expect_error
+):
+    from litellm import Router
+
+    error_raised = False
+
+    llm_router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",
+                "litellm_params": {
+                    "model": "gpt-3.5-turbo",
+                    "api_key": os.getenv("OPENAI_API_KEY"),
+                },
+            }
+        ]
+    )
+    try:
+        is_request_body_safe(
+            request_body={"api_base": "hello-world"},
+            general_settings={
+                "allow_client_side_credentials": allow_client_side_credentials
+            },
+            llm_router=llm_router,
+            model="gpt-3.5-turbo",
+        )
+    except Exception as e:
+        print(e)
+        error_raised = True
+
+    assert expect_error == error_raised
+
+
+@pytest.mark.parametrize(
+    "allow_client_side_credentials, expect_error", [(True, False), (False, True)]
+)
+def test_is_request_body_safe_model_enabled(
+    allow_client_side_credentials, expect_error
+):
+    from litellm import Router
+
+    error_raised = False
+
+    llm_router = Router(
+        model_list=[
+            {
+                "model_name": "fireworks_ai/*",
+                "litellm_params": {
+                    "model": "fireworks_ai/*",
+                    "api_key": os.getenv("FIREWORKS_API_KEY"),
+                    "configurable_clientside_auth_params": (
+                        ["api_base"] if allow_client_side_credentials else []
+                    ),
+                },
+            }
+        ]
+    )
+    try:
+        is_request_body_safe(
+            request_body={"api_base": "hello-world"},
+            general_settings={},
+            llm_router=llm_router,
+            model="fireworks_ai/my-new-model",
+        )
+    except Exception as e:
+        print(e)
+        error_raised = True
+
+    assert expect_error == error_raised


@@ -283,7 +283,7 @@ class PromptFeedback(TypedDict):
 
 class GenerateContentResponseBody(TypedDict, total=False):
-    candidates: Required[List[Candidates]]
+    candidates: List[Candidates]
     promptFeedback: PromptFeedback
     usageMetadata: Required[UsageMetadata]


@@ -139,6 +139,7 @@ class GenericLiteLLMParams(BaseModel):
     )
     max_retries: Optional[int] = None
     organization: Optional[str] = None  # for openai orgs
+    configurable_clientside_auth_params: Optional[List[str]] = None
     ## UNIFIED PROJECT/REGION ##
     region_name: Optional[str] = None
     ## VERTEX AI ##
@@ -310,6 +311,9 @@ class LiteLLMParamsTypedDict(TypedDict, total=False):
     stream_timeout: Optional[Union[float, str]]
     max_retries: Optional[int]
     organization: Optional[Union[List, str]]  # for openai orgs
+    configurable_clientside_auth_params: Optional[
+        List[str]
+    ]  # for allowing api base switching on finetuned models
     ## DROP PARAMS ##
     drop_params: Optional[bool]
     ## UNIFIED PROJECT/REGION ##
@@ -487,6 +491,7 @@ class ModelGroupInfo(BaseModel):
     supports_vision: bool = Field(default=False)
     supports_function_calling: bool = Field(default=False)
     supported_openai_params: Optional[List[str]] = Field(default=[])
+    configurable_clientside_auth_params: Optional[List[str]] = None
 
 
 class AssistantsTypedDict(TypedDict):


@@ -1196,6 +1196,7 @@ all_litellm_params = [
     "client_id",
     "client_secret",
     "user_continue_message",
+    "configurable_clientside_auth_params",
 ]
 
@@ -1323,7 +1324,7 @@ class StandardLoggingPayload(TypedDict):
     metadata: StandardLoggingMetadata
     cache_hit: Optional[bool]
     cache_key: Optional[str]
-    saved_cache_cost: Optional[float]
+    saved_cache_cost: float
     request_tags: list
     end_user: Optional[str]
     requester_ip_address: Optional[str]


@@ -3862,9 +3862,9 @@
         "supports_vision": true
     },
     "anthropic.claude-3-5-sonnet-20240620-v1:0": {
-        "max_tokens": 8192,
+        "max_tokens": 4096,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -3906,9 +3906,9 @@
         "supports_vision": true
     },
    "us.anthropic.claude-3-5-sonnet-20240620-v1:0": {
-        "max_tokens": 8192,
+        "max_tokens": 4096,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -3939,9 +3939,9 @@
         "supports_vision": true
     },
     "eu.anthropic.claude-3-sonnet-20240229-v1:0": {
-        "max_tokens": 8192,
+        "max_tokens": 4096,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -3950,9 +3950,9 @@
         "supports_vision": true
     },
     "eu.anthropic.claude-3-5-sonnet-20240620-v1:0": {
-        "max_tokens": 8192,
+        "max_tokens": 4096,
         "max_input_tokens": 200000,
-        "max_output_tokens": 8192,
+        "max_output_tokens": 4096,
        "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
         "litellm_provider": "bedrock",
@@ -5593,6 +5593,11 @@
         "output_cost_per_token": 0.0000012,
         "litellm_provider": "fireworks_ai"
     },
+    "fireworks-ai-default": {
+        "input_cost_per_token": 0.0,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "fireworks_ai"
+    },
     "fireworks-ai-embedding-up-to-150m": {
         "input_cost_per_token": 0.000000008,
         "output_cost_per_token": 0.000000,