diff --git a/docs/my-website/docs/proxy/tag_routing.md b/docs/my-website/docs/proxy/tag_routing.md index 603c47fad1..4b2621fa8c 100644 --- a/docs/my-website/docs/proxy/tag_routing.md +++ b/docs/my-website/docs/proxy/tag_routing.md @@ -25,6 +25,13 @@ model_list: model: openai/gpt-4o api_key: os.environ/OPENAI_API_KEY tags: ["paid"] # πŸ‘ˆ Key Change + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_API_KEY + api_base: https://exampleopenaiendpoint-production.up.railway.app/ + tags: ["default"] # OPTIONAL - All untagged requests will get routed to this + router_settings: enable_tag_filtering: True # πŸ‘ˆ Key Change @@ -136,6 +143,46 @@ Response } ``` +## Setting Default Tags + +Use this if you want all untagged requests to be routed to specific deployments + +1. Set default tag on your yaml +```yaml + model_list: + - model_name: fake-openai-endpoint + litellm_params: + model: openai/fake + api_key: fake-key + api_base: https://exampleopenaiendpoint-production.up.railway.app/ + tags: ["default"] # πŸ‘ˆ Key Change - All untagged requests will get routed to this + model_info: + id: "default-model" # used for identifying model in response headers +``` + +2. Start proxy +```shell +$ litellm --config /path/to/config.yaml +``` + +3. Make request with no tags +```shell +curl -i http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "fake-openai-endpoint", + "messages": [ + {"role": "user", "content": "Hello, Claude gm!"} + ] + }' +``` + +Expect to see the following response header when this works +```shell +x-litellm-model-id: default-model +``` + ## ✨ Team based tag routing (Enterprise) LiteLLM Proxy supports team-based tag routing, allowing you to associate specific tags with teams and route requests accordingly. 
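As a quick orientation, the same behaviour can be exercised directly against `litellm.Router` (this mirrors the `test_default_tagged_deployments` test added later in this diff; the deployment ids are illustrative and an `OPENAI_API_KEY` is assumed in the environment):

```python
# Router-level sketch of the "default" tag behaviour documented above.
# Deployment ids ("default-model", "team-a-model") are illustrative.
import asyncio
import os

import litellm

router = litellm.Router(
    model_list=[
        {
            "model_name": "gpt-4",
            "litellm_params": {
                "model": "openai/gpt-4o",
                "api_key": os.environ.get("OPENAI_API_KEY"),
                "tags": ["default"],  # untagged requests land here
            },
            "model_info": {"id": "default-model"},
        },
        {
            "model_name": "gpt-4",
            "litellm_params": {
                "model": "openai/gpt-4o",
                "api_key": os.environ.get("OPENAI_API_KEY"),
                "tags": ["teamA"],  # only requests tagged "teamA" match this
            },
            "model_info": {"id": "team-a-model"},
        },
    ],
    enable_tag_filtering=True,  # same switch as router_settings.enable_tag_filtering
)


async def main():
    # Untagged request -> only the "default"-tagged deployment stays a candidate.
    untagged = await router.acompletion(
        model="gpt-4", messages=[{"role": "user", "content": "hello"}]
    )
    print(untagged._hidden_params["model_id"])  # expect: default-model

    # Explicitly tagged "default" -> same outcome.
    tagged_default = await router.acompletion(
        model="gpt-4",
        messages=[{"role": "user", "content": "hello"}],
        metadata={"tags": ["default"]},
    )
    print(tagged_default._hidden_params["model_id"])  # expect: default-model


asyncio.run(main())
```

The same tag filter also backs the team-based routing that follows.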
Example **Team A can access gpt-4 deployment A, Team B can access gpt-4 deployment B** (LLM Access Control For Teams) @@ -170,6 +217,12 @@ Here's how to set up and use team-based tag routing using curl commands: tags: ["teamB"] # πŸ‘ˆ Key Change model_info: id: "team-b-model" # used for identifying model in response headers + - model_name: fake-openai-endpoint + litellm_params: + model: openai/fake + api_key: fake-key + api_base: https://exampleopenaiendpoint-production.up.railway.app/ + tags: ["default"] # OPTIONAL - All untagged requests will get routed to this router_settings: enable_tag_filtering: True # πŸ‘ˆ Key Change diff --git a/docs/my-website/docs/proxy/team_logging.md b/docs/my-website/docs/proxy/team_logging.md index c593f23bf5..fb177da761 100644 --- a/docs/my-website/docs/proxy/team_logging.md +++ b/docs/my-website/docs/proxy/team_logging.md @@ -208,8 +208,8 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \ -d '{ "metadata": { "logging": [{ - "callback_name": "langfuse", # 'otel', 'langfuse', 'lunary' - "callback_type": "success" # set, if required by integration - future improvement, have logging tools work for success + failure by default + "callback_name": "langfuse", # "otel", "langfuse", "lunary" + "callback_type": "success", # "success", "failure", "success_and_failure" "callback_vars": { "langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY", # [RECOMMENDED] reference key in proxy environment "langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY", # [RECOMMENDED] reference key in proxy environment diff --git a/litellm/__init__.py b/litellm/__init__.py index 25cae83282..cf13edce40 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -118,7 +118,7 @@ in_memory_llm_clients_cache: dict = {} safe_memory_mode: bool = False enable_azure_ad_token_refresh: Optional[bool] = False ### DEFAULT AZURE API VERSION ### -AZURE_DEFAULT_API_VERSION = "2024-07-01-preview" # this is updated to the latest +AZURE_DEFAULT_API_VERSION = "2024-08-01-preview" # this is updated to the latest ### COHERE EMBEDDINGS DEFAULT TYPE ### COHERE_DEFAULT_EMBEDDING_INPUT_TYPE = "search_document" ### GUARDRAILS ### @@ -483,7 +483,12 @@ openai_compatible_providers: List = [ "azure_ai", "github", ] - +openai_text_completion_compatible_providers: List = ( + [ # providers that support `/v1/completions` + "together_ai", + "fireworks_ai", + ] +) # well supported replicate llms replicate_models: List = [ @@ -863,7 +868,7 @@ from .llms.custom_llm import CustomLLM from .llms.huggingface_restapi import HuggingfaceConfig from .llms.anthropic.chat import AnthropicConfig from .llms.anthropic.completion import AnthropicTextConfig -from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig +from .llms.databricks.chat import DatabricksConfig, DatabricksEmbeddingConfig from .llms.predibase import PredibaseConfig from .llms.replicate import ReplicateConfig from .llms.cohere.completion import CohereConfig diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index 1eb4d0eb94..bcec062de1 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -22,6 +22,9 @@ from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_cha from litellm.llms.anthropic.cost_calculation import ( cost_per_token as anthropic_cost_per_token, ) +from litellm.llms.databricks.cost_calculator import ( + cost_per_token as databricks_cost_per_token, +) from litellm.rerank_api.types import RerankResponse from litellm.types.llms.openai import HttpxBinaryResponseContent from 
litellm.types.router import SPECIAL_MODEL_INFO_PARAMS @@ -159,7 +162,7 @@ def cost_per_token( _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model) model_without_prefix = model - model_parts = model.split("/") + model_parts = model.split("/", 1) if len(model_parts) > 1: model_without_prefix = model_parts[1] else: @@ -212,6 +215,8 @@ def cost_per_token( ) elif custom_llm_provider == "anthropic": return anthropic_cost_per_token(model=model, usage=usage_block) + elif custom_llm_provider == "databricks": + return databricks_cost_per_token(model=model, usage=usage_block) elif custom_llm_provider == "gemini": return google_cost_per_token( model=model_without_prefix, diff --git a/litellm/integrations/opentelemetry.py b/litellm/integrations/opentelemetry.py index e2c3d6f3b8..84e45e16b1 100644 --- a/litellm/integrations/opentelemetry.py +++ b/litellm/integrations/opentelemetry.py @@ -649,7 +649,9 @@ class OpenTelemetry(CustomLogger): return BatchSpanProcessor( OTLPSpanExporterHTTP( endpoint=self.OTEL_ENDPOINT, headers=_split_otel_headers - ) + ), + max_queue_size=100, + max_export_batch_size=100, ) elif self.OTEL_EXPORTER == "otlp_grpc": verbose_logger.debug( @@ -659,7 +661,9 @@ class OpenTelemetry(CustomLogger): return BatchSpanProcessor( OTLPSpanExporterGRPC( endpoint=self.OTEL_ENDPOINT, headers=_split_otel_headers - ) + ), + max_queue_size=100, + max_export_batch_size=100, ) else: verbose_logger.debug( diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py index b1db82a775..43273224cb 100644 --- a/litellm/litellm_core_utils/litellm_logging.py +++ b/litellm/litellm_core_utils/litellm_logging.py @@ -2333,6 +2333,8 @@ def get_standard_logging_object_payload( completion_start_time_float = completion_start_time.timestamp() elif isinstance(completion_start_time, float): completion_start_time_float = completion_start_time + else: + completion_start_time_float = end_time_float # clean up litellm hidden params clean_hidden_params = StandardLoggingHiddenParams( model_id=None, diff --git a/litellm/llms/AzureOpenAI/azure.py b/litellm/llms/AzureOpenAI/azure.py index 70f13375d2..8da9ee063b 100644 --- a/litellm/llms/AzureOpenAI/azure.py +++ b/litellm/llms/AzureOpenAI/azure.py @@ -245,7 +245,10 @@ class AzureOpenAIConfig: - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective. 
""" - if json_schema is not None: + if json_schema is not None and ( + (api_version_year <= "2024" and api_version_month < "08") + or "gpt-4o" not in model + ): # azure api version "2024-08-01-preview" onwards supports 'json_schema' only for gpt-4o _tool_choice = ChatCompletionToolChoiceObjectParam( type="function", function=ChatCompletionToolChoiceFunctionParam( diff --git a/litellm/llms/OpenAI/openai.py b/litellm/llms/OpenAI/openai.py index 8021ccd59e..ed4d199f67 100644 --- a/litellm/llms/OpenAI/openai.py +++ b/litellm/llms/OpenAI/openai.py @@ -1263,6 +1263,7 @@ class OpenAIChatCompletion(BaseLLM): error_headers = getattr(e, "headers", None) if response is not None and hasattr(response, "text"): + error_headers = getattr(e, "headers", None) raise OpenAIError( status_code=500, message=f"{str(e)}\n\nOriginal Response: {response.text}", @@ -1800,12 +1801,11 @@ class OpenAITextCompletion(BaseLLM): headers: Optional[dict] = None, ): super().completion() - exception_mapping_worked = False try: if headers is None: headers = self.validate_environment(api_key=api_key) if model is None or messages is None: - raise OpenAIError(status_code=422, message=f"Missing model or messages") + raise OpenAIError(status_code=422, message="Missing model or messages") if ( len(messages) > 0 diff --git a/litellm/llms/azure_text.py b/litellm/llms/azure_text.py index 9a8d462e56..db8c516b26 100644 --- a/litellm/llms/azure_text.py +++ b/litellm/llms/azure_text.py @@ -162,11 +162,10 @@ class AzureTextCompletion(BaseLLM): client=None, ): super().completion() - exception_mapping_worked = False try: if model is None or messages is None: raise AzureOpenAIError( - status_code=422, message=f"Missing model or messages" + status_code=422, message="Missing model or messages" ) max_retries = optional_params.pop("max_retries", 2) @@ -293,7 +292,10 @@ class AzureTextCompletion(BaseLLM): "api-version", api_version ) - response = azure_client.completions.create(**data, timeout=timeout) # type: ignore + raw_response = azure_client.completions.with_raw_response.create( + **data, timeout=timeout + ) + response = raw_response.parse() stringified_response = response.model_dump() ## LOGGING logging_obj.post_call( @@ -380,13 +382,15 @@ class AzureTextCompletion(BaseLLM): "complete_input_dict": data, }, ) - response = await azure_client.completions.create(**data, timeout=timeout) + raw_response = await azure_client.completions.with_raw_response.create( + **data, timeout=timeout + ) + response = raw_response.parse() return openai_text_completion_config.convert_to_chat_model_response_object( response_object=response.model_dump(), model_response_object=model_response, ) except AzureOpenAIError as e: - exception_mapping_worked = True raise e except Exception as e: status_code = getattr(e, "status_code", 500) diff --git a/litellm/llms/bedrock/chat.py b/litellm/llms/bedrock/chat.py index 4192ae868c..ee09797ba2 100644 --- a/litellm/llms/bedrock/chat.py +++ b/litellm/llms/bedrock/chat.py @@ -736,7 +736,9 @@ class BedrockLLM(BaseAWSLLM): if (stream is not None and stream is True) and provider != "ai21": endpoint_url = f"{endpoint_url}/model/{modelId}/invoke-with-response-stream" - proxy_endpoint_url = f"{proxy_endpoint_url}/model/{modelId}/invoke-with-response-stream" + proxy_endpoint_url = ( + f"{proxy_endpoint_url}/model/{modelId}/invoke-with-response-stream" + ) else: endpoint_url = f"{endpoint_url}/model/{modelId}/invoke" proxy_endpoint_url = f"{proxy_endpoint_url}/model/{modelId}/invoke" @@ -1268,7 +1270,7 @@ class AmazonConverseConfig: if 
len(value) == 0: # converse raises error for empty strings continue value = [value] - optional_params["stop_sequences"] = value + optional_params["stopSequences"] = value if param == "temperature": optional_params["temperature"] = value if param == "top_p": diff --git a/litellm/llms/databricks.py b/litellm/llms/databricks/chat.py similarity index 98% rename from litellm/llms/databricks.py rename to litellm/llms/databricks/chat.py index 3cc1c24568..0421cd9e46 100644 --- a/litellm/llms/databricks.py +++ b/litellm/llms/databricks/chat.py @@ -29,8 +29,8 @@ from litellm.types.utils import ( ) from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage -from .base import BaseLLM -from .prompt_templates.factory import custom_prompt, prompt_factory +from ..base import BaseLLM +from ..prompt_templates.factory import custom_prompt, prompt_factory class DatabricksError(Exception): @@ -328,6 +328,7 @@ class DatabricksChatCompletion(BaseLLM): api_base: str, custom_prompt_dict: dict, model_response: ModelResponse, + custom_llm_provider: str, print_verbose: Callable, encoding, api_key, @@ -371,6 +372,8 @@ class DatabricksChatCompletion(BaseLLM): ) response = ModelResponse(**response_json) + response.model = custom_llm_provider + "/" + response.model + if base_model is not None: response._hidden_params["model"] = base_model return response @@ -472,6 +475,7 @@ class DatabricksChatCompletion(BaseLLM): data=data, api_base=api_base, custom_prompt_dict=custom_prompt_dict, + custom_llm_provider=custom_llm_provider, model_response=model_response, print_verbose=print_verbose, encoding=encoding, @@ -528,6 +532,8 @@ class DatabricksChatCompletion(BaseLLM): response = ModelResponse(**response_json) + response.model = custom_llm_provider + "/" + response.model + if base_model is not None: response._hidden_params["model"] = base_model diff --git a/litellm/llms/databricks/cost_calculator.py b/litellm/llms/databricks/cost_calculator.py new file mode 100644 index 0000000000..3d40f2aa62 --- /dev/null +++ b/litellm/llms/databricks/cost_calculator.py @@ -0,0 +1,39 @@ +""" +Helper util for handling databricks-specific cost calculation +- e.g.: handling 'dbrx-instruct-*' +""" + +from typing import Tuple + +from litellm.types.utils import Usage +from litellm.utils import get_model_info + + +def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]: + """ + Calculates the cost per token for a given model, prompt tokens, and completion tokens. 
+ + Input: + - model: str, the model name without provider prefix + - usage: LiteLLM Usage block, containing anthropic caching information + + Returns: + Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd + """ + base_model = model + if model.startswith("databricks/dbrx-instruct") or model.startswith( + "dbrx-instruct" + ): + base_model = "databricks-dbrx-instruct" + + ## GET MODEL INFO + model_info = get_model_info(model=base_model, custom_llm_provider="databricks") + + ## CALCULATE INPUT COST + + prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"] + + ## CALCULATE OUTPUT COST + completion_cost = usage["completion_tokens"] * model_info["output_cost_per_token"] + + return prompt_cost, completion_cost diff --git a/litellm/llms/sagemaker/sagemaker.py b/litellm/llms/sagemaker/sagemaker.py index cbf1a9f62b..a7b36134b5 100644 --- a/litellm/llms/sagemaker/sagemaker.py +++ b/litellm/llms/sagemaker/sagemaker.py @@ -273,7 +273,7 @@ class SagemakerLLM(BaseAWSLLM): model_id = optional_params.get("model_id", None) if use_messages_api is True: - from litellm.llms.databricks import DatabricksChatCompletion + from litellm.llms.databricks.chat import DatabricksChatCompletion openai_like_chat_completions = DatabricksChatCompletion() inference_params["stream"] = True if stream is True else False diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py index 69909765e8..c30fa900f0 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py @@ -80,7 +80,7 @@ class VertexAIPartnerModels(BaseLLM): import vertexai from google.cloud import aiplatform - from litellm.llms.databricks import DatabricksChatCompletion + from litellm.llms.databricks.chat import DatabricksChatCompletion from litellm.llms.OpenAI.openai import OpenAIChatCompletion from litellm.llms.text_completion_codestral import CodestralTextCompletion from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( diff --git a/litellm/main.py b/litellm/main.py index cb35556191..1d20cf4240 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -92,7 +92,7 @@ from .llms.cohere import chat as cohere_chat from .llms.cohere import completion as cohere_completion # type: ignore from .llms.cohere import embed as cohere_embed from .llms.custom_llm import CustomLLM, custom_chat_llm_router -from .llms.databricks import DatabricksChatCompletion +from .llms.databricks.chat import DatabricksChatCompletion from .llms.huggingface_restapi import Huggingface from .llms.OpenAI.audio_transcriptions import OpenAIAudioTranscription from .llms.OpenAI.openai import OpenAIChatCompletion, OpenAITextCompletion @@ -1013,7 +1013,10 @@ def completion( api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE") api_version = ( - api_version or litellm.api_version or get_secret("AZURE_API_VERSION") + api_version + or litellm.api_version + or get_secret("AZURE_API_VERSION") + or litellm.AZURE_DEFAULT_API_VERSION ) api_key = ( @@ -1209,6 +1212,9 @@ def completion( custom_llm_provider == "text-completion-openai" or "ft:babbage-002" in model or "ft:davinci-002" in model # support for finetuned completion models + or custom_llm_provider + in litellm.openai_text_completion_compatible_providers + and kwargs.get("text_completion") is True ): openai.api_type = "openai" @@ -4099,8 
+4105,8 @@ def text_completion( kwargs.pop("prompt", None) - if ( - _model is not None and custom_llm_provider == "openai" + if _model is not None and ( + custom_llm_provider == "openai" ): # for openai compatible endpoints - e.g. vllm, call the native /v1/completions endpoint for text completion calls if _model not in litellm.open_ai_chat_completion_models: model = "text-completion-openai/" + _model diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 487e187a3c..912c968311 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -2512,16 +2512,16 @@ "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, "max_pdf_size_mb": 30, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, - "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "input_cost_per_token": 0.000000075, + "input_cost_per_token_above_128k_tokens": 0.00000015, + "output_cost_per_token": 0.0000003, + "output_cost_per_token_above_128k_tokens": 0.0000006, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, "supports_function_calling": true, "supports_vision": true, - "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-flash-latest": { "max_tokens": 8192, @@ -2533,16 +2533,16 @@ "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, "max_pdf_size_mb": 30, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, - "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "input_cost_per_token": 0.000000075, + "input_cost_per_token_above_128k_tokens": 0.00000015, + "output_cost_per_token": 0.0000003, + "output_cost_per_token_above_128k_tokens": 0.0000006, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, "supports_function_calling": true, "supports_vision": true, - "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-pro": { "max_tokens": 8192, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index 335e934475..bf86da1e12 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,16 +1,9 @@ model_list: - - model_name: "anthropic/claude-3-5-sonnet-20240620" + - model_name: "gpt-turbo" litellm_params: - model: anthropic/claude-3-5-sonnet-20240620 - # api_base: http://0.0.0.0:9000 - - model_name: gpt-3.5-turbo - litellm_params: - model: openai/* + model: azure/chatgpt-v-2 + api_key: os.environ/AZURE_API_KEY + api_base: os.environ/AZURE_API_BASE -litellm_settings: - success_callback: ["s3"] - s3_callback_params: - s3_bucket_name: litellm-logs # AWS Bucket Name for S3 - s3_region_name: us-west-2 # AWS Region Name for S3 - s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. 
This is AWS Access Key ID for S3 - s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3 \ No newline at end of file +router_settings: + model_group_alias: {"gpt-4": "gpt-turbo"} \ No newline at end of file diff --git a/litellm/proxy/_types.py b/litellm/proxy/_types.py index c2b240ea65..3559a4792f 100644 --- a/litellm/proxy/_types.py +++ b/litellm/proxy/_types.py @@ -242,6 +242,9 @@ class LiteLLMRoutes(enum.Enum): "/v1/models", # token counter "/utils/token_counter", + # rerank + "/rerank", + "/v1/rerank", ] mapped_pass_through_routes: List = [ diff --git a/litellm/proxy/health_check.py b/litellm/proxy/health_check.py index ff5ed7bfb7..215d2d8d60 100644 --- a/litellm/proxy/health_check.py +++ b/litellm/proxy/health_check.py @@ -3,7 +3,7 @@ import asyncio import logging import random -from typing import Optional +from typing import List, Optional import litellm from litellm._logging import print_verbose @@ -36,6 +36,25 @@ def _clean_endpoint_data(endpoint_data: dict, details: Optional[bool] = True): ) +def filter_deployments_by_id( + model_list: List, +) -> List: + seen_ids = set() + filtered_deployments = [] + + for deployment in model_list: + _model_info = deployment.get("model_info") or {} + _id = _model_info.get("id") or None + if _id is None: + continue + + if _id not in seen_ids: + seen_ids.add(_id) + filtered_deployments.append(deployment) + + return filtered_deployments + + async def _perform_health_check(model_list: list, details: Optional[bool] = True): """ Perform a health check for each model in the list. @@ -105,6 +124,9 @@ async def perform_health_check( _new_model_list = [x for x in model_list if x["model_name"] == model] model_list = _new_model_list + model_list = filter_deployments_by_id( + model_list=model_list + ) # filter duplicate deployments (e.g. 
when model alias'es are used) healthy_endpoints, unhealthy_endpoints = await _perform_health_check( model_list, details ) diff --git a/litellm/proxy/litellm_pre_call_utils.py b/litellm/proxy/litellm_pre_call_utils.py index d41aae50f6..890c576c94 100644 --- a/litellm/proxy/litellm_pre_call_utils.py +++ b/litellm/proxy/litellm_pre_call_utils.py @@ -86,10 +86,11 @@ def convert_key_logging_metadata_to_callback( team_callback_settings_obj.success_callback = [] if team_callback_settings_obj.failure_callback is None: team_callback_settings_obj.failure_callback = [] + if data.callback_name not in team_callback_settings_obj.success_callback: team_callback_settings_obj.success_callback.append(data.callback_name) - if data.callback_name in team_callback_settings_obj.failure_callback: + if data.callback_name not in team_callback_settings_obj.failure_callback: team_callback_settings_obj.failure_callback.append(data.callback_name) for var, value in data.callback_vars.items(): diff --git a/litellm/proxy/management_helpers/utils.py b/litellm/proxy/management_helpers/utils.py index efbe667fb6..af8e852013 100644 --- a/litellm/proxy/management_helpers/utils.py +++ b/litellm/proxy/management_helpers/utils.py @@ -109,8 +109,8 @@ async def add_new_member( where={"user_id": user_info.user_id}, # type: ignore data={"teams": {"push": [team_id]}}, ) - - returned_user = LiteLLM_UserTable(**_returned_user.model_dump()) + if _returned_user is not None: + returned_user = LiteLLM_UserTable(**_returned_user.model_dump()) elif len(existing_user_row) > 1: raise HTTPException( status_code=400, diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index e385a23d7e..ad7fbd384e 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -1,19 +1,19 @@ model_list: - model_name: openai/* litellm_params: - model: gpt-3.5-turbo + model: openai/* api_key: os.environ/OPENAI_API_KEY - -litellm_settings: - success_callback: ["prometheus"] - failure_callback: ["prometheus"] - -guardrails: - - guardrail_name: "presidio-pre-guard" + model_info: + id: "good-openai" + - model_name: openai/* litellm_params: - guardrail: presidio # supported values: "aporia", "lakera", "presidio" - mode: "pre_call" # pre_call, during_call, post_call - output_parse_pii: True + model: openai/* + api_key: os.environ/non-exsitent-env-var + tags: ["bad-model"] + model_info: + id: "test-openai" + + litellm_settings: callbacks: ["otel"] @@ -22,8 +22,16 @@ callback_settings: otel: message_logging: False +router_settings: + enable_tag_filtering: True # πŸ‘ˆ Key Chang + + general_settings: master_key: sk-1234 alerting: ["slack"] spend_report_frequency: "1d" + +litellm_settings: + success_callback: ["prometheus"] + failure_callback: ["prometheus"] \ No newline at end of file diff --git a/litellm/router.py b/litellm/router.py index bcd0b6221d..5a01f4f395 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -3690,7 +3690,7 @@ class Router: exception=original_exception, ) - allowed_fails = _allowed_fails or self.allowed_fails + allowed_fails = _allowed_fails if _allowed_fails is not None else self.allowed_fails dt = get_utc_datetime() current_minute = dt.strftime("%H-%M") @@ -4556,6 +4556,27 @@ class Router: ids.append(id) return ids + def _get_all_deployments( + self, model_name: str, model_alias: Optional[str] = None + ) -> List[DeploymentTypedDict]: + """ + Return all deployments of a model name + + Used for accurate 'get_model_list'. 
+ """ + + returned_models: List[DeploymentTypedDict] = [] + for model in self.model_list: + if model["model_name"] == model_name: + if model_alias is not None: + alias_model = copy.deepcopy(model) + alias_model["model_name"] = model_name + returned_models.append(alias_model) + else: + returned_models.append(model) + + return returned_models + def get_model_names(self) -> List[str]: """ Returns all possible model names for router. @@ -4567,15 +4588,18 @@ class Router: def get_model_list( self, model_name: Optional[str] = None ) -> Optional[List[DeploymentTypedDict]]: + """ + Includes router model_group_alias'es as well + """ if hasattr(self, "model_list"): returned_models: List[DeploymentTypedDict] = [] for model_alias, model_value in self.model_group_alias.items(): - model_alias_item = DeploymentTypedDict( - model_name=model_alias, - litellm_params=LiteLLMParamsTypedDict(model=model_value), + returned_models.extend( + self._get_all_deployments( + model_name=model_value, model_alias=model_alias + ) ) - returned_models.append(model_alias_item) if model_name is None: returned_models += self.model_list @@ -4583,8 +4607,7 @@ class Router: return returned_models for model in self.model_list: - if model["model_name"] == model_name: - returned_models.append(model) + returned_models.extend(self._get_all_deployments(model_name=model_name)) return returned_models return None diff --git a/litellm/router_strategy/tag_based_routing.py b/litellm/router_strategy/tag_based_routing.py index ed350109c1..78bc5e4f9f 100644 --- a/litellm/router_strategy/tag_based_routing.py +++ b/litellm/router_strategy/tag_based_routing.py @@ -1,5 +1,9 @@ """ -Use this to route requests between free and paid tiers +Use this to route requests between Teams + +- If tags in request is a subset of tags in deployment, return deployment +- if deployments are set with default tags, return all default deployment +- If no default_deployments are set, return all deployments """ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, TypedDict, Union @@ -25,14 +29,14 @@ async def get_deployments_for_tag( if request_kwargs is None: verbose_logger.debug( - "get_deployments_for_tier: request_kwargs is None returning healthy_deployments: %s", + "get_deployments_for_tag: request_kwargs is None returning healthy_deployments: %s", healthy_deployments, ) return healthy_deployments if healthy_deployments is None: verbose_logger.debug( - "get_deployments_for_tier: healthy_deployments is None returning healthy_deployments" + "get_deployments_for_tag: healthy_deployments is None returning healthy_deployments" ) return healthy_deployments @@ -43,7 +47,9 @@ async def get_deployments_for_tag( new_healthy_deployments = [] if request_tags: - verbose_logger.debug("parameter routing: router_keys: %s", request_tags) + verbose_logger.debug( + "get_deployments_for_tag routing: router_keys: %s", request_tags + ) # example this can be router_keys=["free", "custom"] # get all deployments that have a superset of these router keys for deployment in healthy_deployments: @@ -66,9 +72,26 @@ async def get_deployments_for_tag( request_tags, ) new_healthy_deployments.append(deployment) + elif "default" in deployment_tags: + verbose_logger.debug( + "adding default deployment with tags: %s, request tags: %s", + deployment_tags, + request_tags, + ) + new_healthy_deployments.append(deployment) return new_healthy_deployments + # for Untagged requests use default deployments if set + _default_deployments_with_tags = [] + for deployment in healthy_deployments: + 
if "default" in deployment.get("litellm_params", {}).get("tags", []): + _default_deployments_with_tags.append(deployment) + + if len(_default_deployments_with_tags) > 0: + return _default_deployments_with_tags + + # if no default deployment is found, return healthy_deployments verbose_logger.debug( "no tier found in metadata, returning healthy_deployments: %s", healthy_deployments, diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 619d2ab5d3..3adf3bbee9 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -626,6 +626,8 @@ async def test_model_function_invoke(model, sync_mode, api_key, api_base): response = await litellm.acompletion(**data) print(f"response: {response}") + except litellm.InternalServerError: + pass except litellm.RateLimitError as e: pass except Exception as e: @@ -889,18 +891,29 @@ def encode_image(image_path): return base64.b64encode(image_file.read()).decode("utf-8") -@pytest.mark.skip( - reason="we already test claude-3, this is just another way to pass images" -) -def test_completion_claude_3_base64(): +@pytest.mark.parametrize( + "model", + [ + "gpt-4o", + "azure/gpt-4o", + "anthropic/claude-3-opus-20240229", + ], +) # +def test_completion_base64(model): try: + import base64 + + import requests + litellm.set_verbose = True - litellm.num_retries = 3 - image_path = "../proxy/cached_logo.jpg" - # Getting the base64 string - base64_image = encode_image(image_path) + url = "https://dummyimage.com/100/100/fff&text=Test+image" + response = requests.get(url) + file_data = response.content + + encoded_file = base64.b64encode(file_data).decode("utf-8") + base64_image = f"data:image/png;base64,{encoded_file}" resp = litellm.completion( - model="anthropic/claude-3-opus-20240229", + model=model, messages=[ { "role": "user", @@ -908,9 +921,7 @@ def test_completion_claude_3_base64(): {"type": "text", "text": "Whats in this image?"}, { "type": "image_url", - "image_url": { - "url": "data:image/jpeg;base64," + base64_image - }, + "image_url": {"url": base64_image}, }, ], } @@ -919,7 +930,6 @@ def test_completion_claude_3_base64(): print(f"\nResponse: {resp}") prompt_tokens = resp.usage.prompt_tokens - raise Exception("it worked!") except Exception as e: if "500 Internal error encountered.'" in str(e): pass @@ -2174,15 +2184,16 @@ def test_completion_openai(): @pytest.mark.parametrize( - "model", + "model, api_version", [ - "gpt-4o-2024-08-06", - "azure/chatgpt-v-2", - "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + ("gpt-4o-2024-08-06", None), + ("azure/chatgpt-v-2", None), + ("bedrock/anthropic.claude-3-sonnet-20240229-v1:0", None), + ("azure/gpt-4o", "2024-08-01-preview"), ], ) @pytest.mark.flaky(retries=3, delay=1) -def test_completion_openai_pydantic(model): +def test_completion_openai_pydantic(model, api_version): try: litellm.set_verbose = True from pydantic import BaseModel @@ -2207,6 +2218,7 @@ def test_completion_openai_pydantic(model): messages=messages, metadata={"hi": "bye"}, response_format=EventsList, + api_version=api_version, ) break except litellm.JSONSchemaValidationError: @@ -3469,14 +3481,14 @@ def response_format_tests(response: litellm.ModelResponse): @pytest.mark.parametrize( "model", [ - # "bedrock/cohere.command-r-plus-v1:0", + "bedrock/mistral.mistral-large-2407-v1:0", + "bedrock/cohere.command-r-plus-v1:0", "anthropic.claude-3-sonnet-20240229-v1:0", - # "anthropic.claude-instant-v1", - # "bedrock/ai21.j2-mid", - # "mistral.mistral-7b-instruct-v0:2", + "anthropic.claude-instant-v1", 
+ "mistral.mistral-7b-instruct-v0:2", # "bedrock/amazon.titan-tg1-large", - # "meta.llama3-8b-instruct-v1:0", - # "cohere.command-text-v14", + "meta.llama3-8b-instruct-v1:0", + "cohere.command-text-v14", ], ) @pytest.mark.parametrize("sync_mode", [True, False]) @@ -3491,6 +3503,7 @@ async def test_completion_bedrock_httpx_models(sync_mode, model): messages=[{"role": "user", "content": "Hey! how's it going?"}], temperature=0.2, max_tokens=200, + stop=["stop sequence"], ) assert isinstance(response, litellm.ModelResponse) @@ -3502,6 +3515,7 @@ async def test_completion_bedrock_httpx_models(sync_mode, model): messages=[{"role": "user", "content": "Hey! how's it going?"}], temperature=0.2, max_tokens=100, + stop=["stop sequence"], ) assert isinstance(response, litellm.ModelResponse) diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py index 55a5abbdd7..ed9eebedb1 100644 --- a/litellm/tests/test_completion_cost.py +++ b/litellm/tests/test_completion_cost.py @@ -1219,3 +1219,13 @@ def test_completion_cost_anthropic_prompt_caching(): cost_2 = completion_cost(model=model, completion_response=response_2) assert cost_1 > cost_2 + + +def test_completion_cost_databricks(): + model, messages = "databricks/databricks-dbrx-instruct", [ + {"role": "user", "content": "What is 2+2?"} + ] + + resp = litellm.completion(model=model, messages=messages) # works fine + + cost = completion_cost(completion_response=resp) diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py index 0388e026b9..a570692f6d 100644 --- a/litellm/tests/test_exceptions.py +++ b/litellm/tests/test_exceptions.py @@ -864,7 +864,7 @@ def _pre_call_utils( data["messages"] = [{"role": "user", "content": "Hello world"}] if streaming is True: data["stream"] = True - mapped_target = client.chat.completions.with_raw_response + mapped_target = client.chat.completions.with_raw_response # type: ignore if sync_mode: original_function = litellm.completion else: @@ -873,7 +873,7 @@ def _pre_call_utils( data["prompt"] = "Hello world" if streaming is True: data["stream"] = True - mapped_target = client.completions.with_raw_response + mapped_target = client.completions.with_raw_response # type: ignore if sync_mode: original_function = litellm.text_completion else: diff --git a/litellm/tests/test_function_calling.py b/litellm/tests/test_function_calling.py index 79db9f1623..f30f713ead 100644 --- a/litellm/tests/test_function_calling.py +++ b/litellm/tests/test_function_calling.py @@ -52,6 +52,7 @@ def get_current_weather(location, unit="fahrenheit"): # "anthropic.claude-3-sonnet-20240229-v1:0", ], ) +@pytest.mark.flaky(retries=3, delay=1) def test_aaparallel_function_call(model): try: litellm.set_verbose = True diff --git a/litellm/tests/test_proxy_server.py b/litellm/tests/test_proxy_server.py index fb1025ab26..102c126d11 100644 --- a/litellm/tests/test_proxy_server.py +++ b/litellm/tests/test_proxy_server.py @@ -1255,7 +1255,17 @@ async def test_add_callback_via_key(prisma_client): @pytest.mark.asyncio -async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client): +@pytest.mark.parametrize( + "callback_type, expected_success_callbacks, expected_failure_callbacks", + [ + ("success", ["langfuse"], []), + ("failure", [], ["langfuse"]), + ("success_and_failure", ["langfuse"], ["langfuse"]), + ], +) +async def test_add_callback_via_key_litellm_pre_call_utils( + prisma_client, callback_type, expected_success_callbacks, expected_failure_callbacks +): import json from fastapi import 
HTTPException, Request, Response @@ -1312,7 +1322,7 @@ async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client): "logging": [ { "callback_name": "langfuse", - "callback_type": "success", + "callback_type": callback_type, "callback_vars": { "langfuse_public_key": "my-mock-public-key", "langfuse_secret_key": "my-mock-secret-key", @@ -1359,14 +1369,21 @@ async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client): } new_data = await add_litellm_data_to_request(**data) + print("NEW DATA: {}".format(new_data)) - assert "success_callback" in new_data - assert new_data["success_callback"] == ["langfuse"] assert "langfuse_public_key" in new_data assert new_data["langfuse_public_key"] == "my-mock-public-key" assert "langfuse_secret_key" in new_data assert new_data["langfuse_secret_key"] == "my-mock-secret-key" + if expected_success_callbacks: + assert "success_callback" in new_data + assert new_data["success_callback"] == expected_success_callbacks + + if expected_failure_callbacks: + assert "failure_callback" in new_data + assert new_data["failure_callback"] == expected_failure_callbacks + @pytest.mark.asyncio async def test_gemini_pass_through_endpoint(): diff --git a/litellm/tests/test_router_tag_routing.py b/litellm/tests/test_router_tag_routing.py index 67f100d794..f71a9b762d 100644 --- a/litellm/tests/test_router_tag_routing.py +++ b/litellm/tests/test_router_tag_routing.py @@ -91,3 +91,72 @@ async def test_router_free_paid_tier(): print("response_extra_info: ", response_extra_info) assert response_extra_info["model_id"] == "very-expensive-model" + + +@pytest.mark.asyncio() +async def test_default_tagged_deployments(): + """ + - only use default deployment for untagged requests + - if a request has tag "default", use default deployment + """ + + router = litellm.Router( + model_list=[ + { + "model_name": "gpt-4", + "litellm_params": { + "model": "gpt-4o", + "api_base": "https://exampleopenaiendpoint-production.up.railway.app/", + "tags": ["default"], + }, + "model_info": {"id": "default-model"}, + }, + { + "model_name": "gpt-4", + "litellm_params": { + "model": "gpt-4o", + "api_base": "https://exampleopenaiendpoint-production.up.railway.app/", + }, + "model_info": {"id": "default-model-2"}, + }, + { + "model_name": "gpt-4", + "litellm_params": { + "model": "gpt-4o-mini", + "api_base": "https://exampleopenaiendpoint-production.up.railway.app/", + "tags": ["teamA"], + }, + "model_info": {"id": "very-expensive-model"}, + }, + ], + enable_tag_filtering=True, + ) + + for _ in range(5): + # Untagged request, this should pick model with id == "default-model" + response = await router.acompletion( + model="gpt-4", + messages=[{"role": "user", "content": "Tell me a joke."}], + ) + + print("Response: ", response) + + response_extra_info = response._hidden_params + print("response_extra_info: ", response_extra_info) + + assert response_extra_info["model_id"] == "default-model" + + for _ in range(5): + # requests tagged with "default", this should pick model with id == "default-model" + response = await router.acompletion( + model="gpt-4", + messages=[{"role": "user", "content": "Tell me a joke."}], + metadata={"tags": ["default"]}, + ) + + print("Response: ", response) + + response_extra_info = response._hidden_params + print("response_extra_info: ", response_extra_info) + + assert response_extra_info["model_id"] == "default-model" diff --git a/litellm/tests/test_text_completion.py b/litellm/tests/test_text_completion.py index 70325c44e2..e6a4a0499c 100644 --- 
a/litellm/tests/test_text_completion.py +++ b/litellm/tests/test_text_completion.py @@ -4239,3 +4239,14 @@ def test_completion_vllm(): mock_call.assert_called_once() assert "hello" in mock_call.call_args.kwargs["extra_body"] + + +def test_completion_fireworks_ai_multiple_choices(): + litellm.set_verbose = True + response = litellm.text_completion( + model="fireworks_ai/llama-v3p1-8b-instruct", + prompt=["halo", "hi", "halo", "hi"], + ) + print(response.choices) + + assert len(response.choices) == 4 diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 487e187a3c..912c968311 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -2512,16 +2512,16 @@ "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, "max_pdf_size_mb": 30, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, - "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "input_cost_per_token": 0.000000075, + "input_cost_per_token_above_128k_tokens": 0.00000015, + "output_cost_per_token": 0.0000003, + "output_cost_per_token_above_128k_tokens": 0.0000006, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, "supports_function_calling": true, "supports_vision": true, - "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-flash-latest": { "max_tokens": 8192, @@ -2533,16 +2533,16 @@ "max_audio_length_hours": 8.4, "max_audio_per_prompt": 1, "max_pdf_size_mb": 30, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, - "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "input_cost_per_token": 0.000000075, + "input_cost_per_token_above_128k_tokens": 0.00000015, + "output_cost_per_token": 0.0000003, + "output_cost_per_token_above_128k_tokens": 0.0000006, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, "supports_function_calling": true, "supports_vision": true, - "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-pro": { "max_tokens": 8192, diff --git a/proxy_server_config.yaml b/proxy_server_config.yaml index 57113d3509..b1d6b3dc66 100644 --- a/proxy_server_config.yaml +++ b/proxy_server_config.yaml @@ -148,6 +148,7 @@ router_settings: redis_password: os.environ/REDIS_PASSWORD redis_port: os.environ/REDIS_PORT enable_pre_call_checks: true + model_group_alias: {"my-special-fake-model-alias-name": "fake-openai-endpoint-3"} general_settings: master_key: sk-1234 # [OPTIONAL] Use to enforce auth on proxy. See - https://docs.litellm.ai/docs/proxy/virtual_keys diff --git a/pyproject.toml b/pyproject.toml index e07372d325..cf9a543095 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.44.22" +version = "1.44.23" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.44.22" +version = "1.44.23" version_files = [ "pyproject.toml:^version" ]
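The selection order implemented by the `litellm/router_strategy/tag_based_routing.py` change above is: for tagged requests, keep every deployment whose tags cover the request tags, plus any deployment tagged `default`; for untagged requests, fall back to the `default`-tagged deployments when any exist; otherwise return all healthy deployments unchanged. A condensed, self-contained sketch of that order (simplified from the docstring and fragments visible in the hunk; not the actual async helper):

```python
from typing import Any, Dict, List, Optional


def select_deployments_for_tags(
    healthy_deployments: List[Dict[str, Any]],
    request_tags: Optional[List[str]],
) -> List[Dict[str, Any]]:
    """Condensed restatement of the selection order in get_deployments_for_tag."""
    if not healthy_deployments:
        return healthy_deployments

    if request_tags:
        selected = []
        for deployment in healthy_deployments:
            deployment_tags = deployment.get("litellm_params", {}).get("tags") or []
            # keep deployments whose tags cover every request tag,
            # plus anything explicitly tagged "default"
            if set(request_tags).issubset(deployment_tags) or "default" in deployment_tags:
                selected.append(deployment)
        return selected

    # untagged request: prefer "default"-tagged deployments when any exist
    default_deployments = [
        d
        for d in healthy_deployments
        if "default" in (d.get("litellm_params", {}).get("tags") or [])
    ]
    return default_deployments or healthy_deployments


deployments = [
    {"litellm_params": {"model": "openai/gpt-4o", "tags": ["default"]}},
    {"litellm_params": {"model": "openai/gpt-4o-mini", "tags": ["teamA"]}},
]
assert select_deployments_for_tags(deployments, None) == [deployments[0]]
# "teamA" match is kept, and the "default" deployment stays eligible too
assert select_deployments_for_tags(deployments, ["teamA"]) == deployments
```

This is why the tag_routing.md docs and the new `test_default_tagged_deployments` test both add a `default`-tagged deployment: untagged traffic now lands on it (asserted via the `default-model` id) instead of spreading across every deployment.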