Merge branch 'main' into feat/friendliai

This commit is contained in:
Wonseok Lee (Jack) 2024-06-21 10:50:03 +09:00 committed by GitHub
commit c4c7d1b367
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
201 changed files with 22438 additions and 13694 deletions

View file

@ -65,6 +65,7 @@ jobs:
pip install "pydantic==2.7.1"
pip install "diskcache==5.6.1"
pip install "Pillow==10.3.0"
pip install "ijson==3.2.3"
- save_cache:
paths:
- ./venv
@ -126,6 +127,7 @@ jobs:
pip install jinja2
pip install tokenizers
pip install openai
pip install ijson
- run:
name: Run tests
command: |
@ -180,6 +182,7 @@ jobs:
pip install numpydoc
pip install prisma
pip install fastapi
pip install ijson
pip install "httpx==0.24.1"
pip install "gunicorn==21.2.0"
pip install "anyio==3.7.1"

10
.github/dependabot.yaml vendored Normal file
View file

@ -0,0 +1,10 @@
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "daily"
groups:
github-actions:
patterns:
- "*"

View file

@ -25,6 +25,11 @@ jobs:
if: github.repository == 'BerriAI/litellm'
runs-on: ubuntu-latest
steps:
-
name: Checkout
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.commit_hash }}
-
name: Set up QEMU
uses: docker/setup-qemu-action@v3
@ -41,12 +46,14 @@ jobs:
name: Build and push
uses: docker/build-push-action@v5
with:
context: .
push: true
tags: litellm/litellm:${{ github.event.inputs.tag || 'latest' }}
-
name: Build and push litellm-database image
uses: docker/build-push-action@v5
with:
context: .
push: true
file: Dockerfile.database
tags: litellm/litellm-database:${{ github.event.inputs.tag || 'latest' }}
@ -54,6 +61,7 @@ jobs:
name: Build and push litellm-spend-logs image
uses: docker/build-push-action@v5
with:
context: .
push: true
file: ./litellm-js/spend-logs/Dockerfile
tags: litellm/litellm-spend_logs:${{ github.event.inputs.tag || 'latest' }}
@ -68,6 +76,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.commit_hash }}
# Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
@ -92,7 +102,7 @@ jobs:
- name: Build and push Docker image
uses: docker/build-push-action@4976231911ebf5f32aad765192d35f942aa48cb8
with:
context: https://github.com/BerriAI/litellm.git#${{ github.event.inputs.commit_hash}}
context: .
push: true
tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.release_type }} # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest'
labels: ${{ steps.meta.outputs.labels }}
@ -106,6 +116,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.commit_hash }}
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
@ -128,7 +140,7 @@ jobs:
- name: Build and push Database Docker image
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
with:
context: https://github.com/BerriAI/litellm.git#${{ github.event.inputs.commit_hash}}
context: .
file: Dockerfile.database
push: true
tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.release_type }}
@ -143,6 +155,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.commit_hash }}
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
@ -165,7 +179,7 @@ jobs:
- name: Build and push Database Docker image
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
with:
context: https://github.com/BerriAI/litellm.git#${{ github.event.inputs.commit_hash}}
context: .
file: ./litellm-js/spend-logs/Dockerfile
push: true
tags: ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }}
@ -176,6 +190,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.commit_hash }}
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1

1
.gitignore vendored
View file

@ -60,3 +60,4 @@ litellm/proxy/_experimental/out/404/index.html
litellm/proxy/_experimental/out/model_hub/index.html
litellm/proxy/_experimental/out/onboarding/index.html
litellm/tests/log.txt
litellm/tests/langfuse.log

View file

@ -1,4 +1,19 @@
repos:
- repo: local
hooks:
- id: mypy
name: mypy
entry: python3 -m mypy --ignore-missing-imports
language: system
types: [python]
files: ^litellm/
- id: isort
name: isort
entry: isort
language: system
types: [python]
files: litellm/.*\.py
exclude: ^litellm/__init__.py$
- repo: https://github.com/psf/black
rev: 24.2.0
hooks:
@ -16,11 +31,10 @@ repos:
name: Check if files match
entry: python3 ci_cd/check_files_match.py
language: system
- repo: local
hooks:
- id: mypy
name: mypy
entry: python3 -m mypy --ignore-missing-imports
language: system
types: [python]
files: ^litellm/
# - id: check-file-length
# name: Check file length
# entry: python check_file_length.py
# args: ["10000"] # set your desired maximum number of lines
# language: python
# files: litellm/.*\.py
# exclude: ^litellm/tests/

28
check_file_length.py Normal file
View file

@ -0,0 +1,28 @@
import sys
def check_file_length(max_lines, filenames):
bad_files = []
for filename in filenames:
with open(filename, "r") as file:
lines = file.readlines()
if len(lines) > max_lines:
bad_files.append((filename, len(lines)))
return bad_files
if __name__ == "__main__":
max_lines = int(sys.argv[1])
filenames = sys.argv[2:]
bad_files = check_file_length(max_lines, filenames)
if bad_files:
bad_files.sort(
key=lambda x: x[1], reverse=True
) # Sort files by length in descending order
for filename, length in bad_files:
print(f"{filename}: {length} lines")
sys.exit(1)
else:
sys.exit(0)

View file

@ -0,0 +1,110 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Drop Unsupported Params
Drop OpenAI params that aren't supported by your LLM provider.
## Quick Start
```python
import litellm
import os
# set keys
os.environ["COHERE_API_KEY"] = "co-.."
litellm.drop_params = True # 👈 KEY CHANGE
response = litellm.completion(
model="command-r",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
response_format={"key": "value"},
)
```
LiteLLM maps all supported OpenAI params by provider + model (e.g. function calling is supported by Anthropic on Bedrock but not Titan).
See `litellm.get_supported_openai_params("command-r")` [**Code**](https://github.com/BerriAI/litellm/blob/main/litellm/utils.py#L3584)
If a provider/model doesn't support a particular param, you can drop it.
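For reference, a minimal sketch of checking which OpenAI params LiteLLM supports for a model before relying on `drop_params` (the printed list below is illustrative):
```python
import litellm

# params LiteLLM knows the provider/model supports
supported = litellm.get_supported_openai_params("command-r")
print(supported)  # e.g. ["stream", "temperature", "max_tokens", ...]

# with litellm.drop_params = True, anything outside this list is dropped
if "response_format" not in (supported or []):
    print("response_format would be dropped for command-r")
```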
## OpenAI Proxy Usage
```yaml
litellm_settings:
drop_params: true
```
## Pass drop_params in `completion(..)`
Just pass `drop_params` when calling specific models
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
# set keys
os.environ["COHERE_API_KEY"] = "co-.."
response = litellm.completion(
model="command-r",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
response_format={"key": "value"},
drop_params=True
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
- litellm_params:
api_base: my-base
model: openai/my-model
drop_params: true # 👈 KEY CHANGE
model_name: my-model
```
</TabItem>
</Tabs>
## Specify params to drop
To drop specific params when calling a provider (e.g. `logit_bias` for vllm), use `additional_drop_params`
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
# set keys
os.environ["COHERE_API_KEY"] = "co-.."
response = litellm.completion(
model="command-r",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
response_format={"key": "value"},
additional_drop_params=["response_format"]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
- litellm_params:
api_base: my-base
model: openai/my-model
additional_drop_params: ["response_format"] # 👈 KEY CHANGE
model_name: my-model
```
</TabItem>
</Tabs>
**additional_drop_params**: List or null - A list of OpenAI params you want to drop when making a call to the model.

View file

@ -67,6 +67,10 @@ By default, LiteLLM raises an exception if the openai param being passed in isn'
To drop the param instead, set `litellm.drop_params = True` or `completion(..drop_params=True)`.
This **ONLY DROPS UNSUPPORTED OPENAI PARAMS**.
LiteLLM assumes any non-OpenAI param is provider-specific and passes it in as a kwarg in the request body.
:::
## Input Params
@ -162,7 +166,7 @@ def completion(
- `function`: *object* - Required.
- `tool_choice`: *string or object (optional)* - Controls which (if any) function is called by the model. none means the model will not call a function and instead generates a message. auto means the model can pick between generating a message or calling a function. Specifying a particular function via {"type: "function", "function": {"name": "my_function"}} forces the model to call that function.
- `tool_choice`: *string or object (optional)* - Controls which (if any) function is called by the model. none means the model will not call a function and instead generates a message. auto means the model can pick between generating a message or calling a function. Specifying a particular function via `{"type: "function", "function": {"name": "my_function"}}` forces the model to call that function.
- `none` is the default when no functions are present. `auto` is the default if functions are present.

View file

@ -1,90 +0,0 @@
import Image from '@theme/IdealImage';
import QueryParamReader from '../../src/components/queryParamReader.js'
# [Beta] Monitor Logs in Production
:::note
This is in beta. Expect frequent updates, as we improve based on your feedback.
:::
LiteLLM provides an integration to let you monitor logs in production.
👉 Jump to our sample LiteLLM Dashboard: https://admin.litellm.ai/
<Image img={require('../../img/alt_dashboard.png')} alt="Dashboard" />
## Debug your first logs
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_OpenAI.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
### 1. Get your LiteLLM Token
Go to [admin.litellm.ai](https://admin.litellm.ai/) and copy the code snippet with your unique token
<Image img={require('../../img/hosted_debugger_usage_page.png')} alt="Usage" />
### 2. Set up your environment
**Add it to your .env**
```python
import os
os.env["LITELLM_TOKEN"] = "e24c4c06-d027-4c30-9e78-18bc3a50aebb" # replace with your unique token
```
**Turn on LiteLLM Client**
```python
import litellm
litellm.client = True
```
### 3. Make a normal `completion()` call
```python
import litellm
from litellm import completion
import os
# set env variables
os.environ["LITELLM_TOKEN"] = "e24c4c06-d027-4c30-9e78-18bc3a50aebb" # replace with your unique token
os.environ["OPENAI_API_KEY"] = "openai key"
litellm.use_client = True # enable logging dashboard
messages = [{ "content": "Hello, how are you?","role": "user"}]
# openai call
response = completion(model="gpt-3.5-turbo", messages=messages)
```
Your `completion()` call will print a link to your session dashboard (https://admin.litellm.ai/<your_unique_token>)
In the above case it would be: [`admin.litellm.ai/e24c4c06-d027-4c30-9e78-18bc3a50aebb`](https://admin.litellm.ai/e24c4c06-d027-4c30-9e78-18bc3a50aebb)
Click on your personal dashboard link. Here's how you can find it 👇
<Image img={require('../../img/dash_output.png')} alt="Dashboard" />
[👋 Tell us if you need better privacy controls](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version?month=2023-08)
### 3. Review request log
Oh! Looks like our request was made successfully. Let's click on it and see exactly what got sent to the LLM provider.
Ah! So we can see that this request was made to a **Baseten** (see litellm_params > custom_llm_provider) for a model with ID - **7qQNLDB** (see model). The message sent was - `"Hey, how's it going?"` and the response received was - `"As an AI language model, I don't have feelings or emotions, but I can assist you with your queries. How can I assist you today?"`
<Image img={require('../../img/dashboard_log.png')} alt="Dashboard Log Row" />
:::info
🎉 Congratulations! You've successfully debugged your first log!
:::

View file

@ -2,6 +2,15 @@ import Image from '@theme/IdealImage';
# Athina
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
[Athina](https://athina.ai/) is an evaluation framework and production monitoring platform for your LLM-powered app. Athina is designed to enhance the performance and reliability of AI applications through real-time monitoring, granular analytics, and plug-and-play evaluations.
<Image img={require('../../img/athina_dashboard.png')} />

View file

@ -1,5 +1,14 @@
# Greenscale - Track LLM Spend and Responsible Usage
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
[Greenscale](https://greenscale.ai/) is a production monitoring platform for your LLM-powered app that provides you granular key insights into your GenAI spending and responsible usage. Greenscale only captures metadata to minimize the exposure risk of personally identifiable information (PII).
## Getting Started

View file

@ -1,4 +1,13 @@
# Helicone Tutorial
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
[Helicone](https://helicone.ai/) is an open source observability platform that proxies your OpenAI traffic and provides you key insights into your spend, latency and usage.
## Use Helicone to log requests across all LLM Providers (OpenAI, Azure, Anthropic, Cohere, Replicate, PaLM)

View file

@ -1,6 +1,6 @@
import Image from '@theme/IdealImage';
# Langfuse - Logging LLM Input/Output
# 🔥 Langfuse - Logging LLM Input/Output
LangFuse is open-source Observability & Analytics for LLM Apps
Detailed production traces and a granular view on quality, cost and latency
@ -122,10 +122,12 @@ response = completion(
metadata={
"generation_name": "ishaan-test-generation", # set langfuse Generation Name
"generation_id": "gen-id22", # set langfuse Generation ID
"parent_observation_id": "obs-id9" # set langfuse Parent Observation ID
"version": "test-generation-version" # set langfuse Generation Version
"trace_user_id": "user-id2", # set langfuse Trace User ID
"session_id": "session-1", # set langfuse Session ID
"tags": ["tag1", "tag2"], # set langfuse Tags
"trace_name": "new-trace-name" # set langfuse Trace Name
"trace_id": "trace-id22", # set langfuse Trace ID
"trace_metadata": {"key": "value"}, # set langfuse Trace Metadata
"trace_version": "test-trace-version", # set langfuse Trace Version (if not set, defaults to Generation Version)
@ -147,9 +149,10 @@ print(response)
You can also pass `metadata` as part of the request header with a `langfuse_*` prefix:
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'langfuse_trace_id: trace-id22' \
curl --location --request POST 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--header 'langfuse_trace_id: trace-id2' \
--header 'langfuse_trace_user_id: user-id2' \
--header 'langfuse_trace_metadata: {"key":"value"}' \
--data '{
@ -190,9 +193,10 @@ The following parameters can be updated on a continuation of a trace by passing
#### Generation Specific Parameters
* `generation_id` - Identifier for the generation, auto-generated by default
* `generation_name` - Identifier for the generation, auto-generated by default
* `prompt` - Langfuse prompt object used for the generation, defaults to None
* `generation_id` - Identifier for the generation, auto-generated by default
* `generation_name` - Identifier for the generation, auto-generated by default
* `parent_observation_id` - Identifier for the parent observation, defaults to `None`
* `prompt` - Langfuse prompt object used for the generation, defaults to `None`
Any other key value pairs passed into the metadata not listed in the above spec for a `litellm` completion will be added as a metadata key value pair for the generation.
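For example, a minimal sketch of passing a custom (non-spec) metadata key - `my_custom_field` here is an illustrative name:
```python
import os
import litellm

os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ["OPENAI_API_KEY"] = ""

litellm.success_callback = ["langfuse"]

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi 👋"}],
    metadata={
        "generation_name": "test-generation",  # recognized Langfuse field
        "my_custom_field": "my-value",  # not in the spec above -> logged as generation metadata
    },
)
```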

View file

@ -1,6 +1,16 @@
import Image from '@theme/IdealImage';
# Langsmith - Logging LLM Input/Output
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
An all-in-one developer platform for every step of the application lifecycle
https://smith.langchain.com/

View file

@ -1,6 +1,6 @@
import Image from '@theme/IdealImage';
# Logfire - Logging LLM Input/Output
# 🔥 Logfire - Logging LLM Input/Output
Logfire is open-source Observability & Analytics for LLM Apps
Detailed production traces and a granular view on quality, cost and latency
@ -14,10 +14,14 @@ join our [discord](https://discord.gg/wuPM9dRgDw)
## Pre-Requisites
Ensure you have run `pip install logfire` for this integration
Ensure you have installed the following packages to use this integration
```shell
pip install logfire litellm
pip install litellm
pip install opentelemetry-api==1.25.0
pip install opentelemetry-sdk==1.25.0
pip install opentelemetry-exporter-otlp==1.25.0
```
## Quick Start
@ -25,8 +29,7 @@ pip install logfire litellm
Get your Logfire token from [Logfire](https://logfire.pydantic.dev/)
```python
litellm.success_callback = ["logfire"]
litellm.failure_callback = ["logfire"] # logs errors to logfire
litellm.callbacks = ["logfire"]
```
```python

View file

@ -1,5 +1,13 @@
# Lunary - Logging and tracing LLM input/output
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
[Lunary](https://lunary.ai/) is an open-source AI developer platform providing observability, prompt management, and evaluation tools for AI developers.
<video controls width='900' >

View file

@ -1,5 +1,16 @@
import Image from '@theme/IdealImage';
# Promptlayer Tutorial
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
Promptlayer is a platform for prompt engineers. Log OpenAI requests. Search usage history. Track performance. Visually manage prompt templates.
<Image img={require('../../img/promptlayer.png')} />

View file

@ -1,5 +1,14 @@
import Image from '@theme/IdealImage';
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
# Sentry - Log LLM Exceptions
[Sentry](https://sentry.io/) provides error monitoring for production. LiteLLM can add breadcrumbs and send exceptions to Sentry with this integration

View file

@ -1,4 +1,12 @@
# Supabase Tutorial
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
[Supabase](https://supabase.com/) is an open source Firebase alternative.
Start your project with a Postgres database, Authentication, instant APIs, Edge Functions, Realtime subscriptions, Storage, and Vector embeddings.

View file

@ -1,6 +1,16 @@
import Image from '@theme/IdealImage';
# Weights & Biases - Logging LLM Input/Output
:::tip
This is community maintained. Please open an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
Weights & Biases helps AI developers build better models faster https://wandb.ai
<Image img={require('../../img/wandb.png')} />

View file

@ -4,6 +4,7 @@ import TabItem from '@theme/TabItem';
# Anthropic
LiteLLM supports
- `claude-3.5`
- `claude-3` (`claude-3-haiku-20240307`, `claude-3-opus-20240229`, `claude-3-sonnet-20240229`)
- `claude-2`
- `claude-2.1`
@ -171,6 +172,7 @@ print(response)
|------------------|--------------------------------------------|
| claude-3-haiku | `completion('claude-3-haiku-20240307', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-3-opus | `completion('claude-3-opus-20240229', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-3-5-sonnet | `completion('claude-3-5-sonnet-20240620', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-3-sonnet | `completion('claude-3-sonnet-20240229', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-2.1 | `completion('claude-2.1', messages)` | `os.environ['ANTHROPIC_API_KEY']` |
| claude-2 | `completion('claude-2', messages)` | `os.environ['ANTHROPIC_API_KEY']` |

View file

@ -68,6 +68,7 @@ response = litellm.completion(
| Model Name | Function Call |
|------------------|----------------------------------------|
| gpt-4o | `completion('azure/<your deployment name>', messages)` |
| gpt-4 | `completion('azure/<your deployment name>', messages)` |
| gpt-4-0314 | `completion('azure/<your deployment name>', messages)` |
| gpt-4-0613 | `completion('azure/<your deployment name>', messages)` |
@ -85,7 +86,8 @@ response = litellm.completion(
## Azure OpenAI Vision Models
| Model Name | Function Call |
|-----------------------|-----------------------------------------------------------------|
| gpt-4-vision | `response = completion(model="azure/<your deployment name>", messages=messages)` |
| gpt-4-vision | `completion(model="azure/<your deployment name>", messages=messages)` |
| gpt-4o | `completion('azure/<your deployment name>', messages)` |
#### Usage
```python

View file

@ -623,6 +623,7 @@ Here's an example of using a bedrock model with LiteLLM
| Model Name | Command |
|----------------------------|------------------------------------------------------------------|
| Anthropic Claude-V3.5 Sonnet | `completion(model='bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3 sonnet | `completion(model='bedrock/anthropic.claude-3-sonnet-20240229-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3 Haiku | `completion(model='bedrock/anthropic.claude-3-haiku-20240307-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |
| Anthropic Claude-V3 Opus | `completion(model='bedrock/anthropic.claude-3-opus-20240229-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` |

View file

@ -0,0 +1,255 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Codestral API [Mistral AI]
Codestral is available in select code-completion plugins but can also be queried directly. See the documentation for more details.
## API Key
```python
# env variable
os.environ['CODESTRAL_API_KEY']
```
## FIM / Completions
:::info
Official Mistral API Docs: https://docs.mistral.ai/api/#operation/createFIMCompletion
:::
<Tabs>
<TabItem value="no-streaming" label="No Streaming">
#### Sample Usage
```python
import os
import litellm
os.environ['CODESTRAL_API_KEY']
response = await litellm.atext_completion(
model="text-completion-codestral/codestral-2405",
prompt="def is_odd(n): \n return n % 2 == 1 \ndef test_is_odd():",
suffix="return True", # optional
temperature=0, # optional
top_p=1, # optional
max_tokens=10, # optional
min_tokens=10, # optional
seed=10, # optional
stop=["return"], # optional
)
```
#### Expected Response
```json
{
"id": "b41e0df599f94bc1a46ea9fcdbc2aabe",
"object": "text_completion",
"created": 1589478378,
"model": "codestral-latest",
"choices": [
{
"text": "\n assert is_odd(1)\n assert",
"index": 0,
"logprobs": null,
"finish_reason": "length"
}
],
"usage": {
"prompt_tokens": 5,
"completion_tokens": 7,
"total_tokens": 12
}
}
```
</TabItem>
<TabItem value="stream" label="Streaming">
#### Sample Usage - Streaming
```python
import os
import litellm
os.environ['CODESTRAL_API_KEY']
response = await litellm.atext_completion(
model="text-completion-codestral/codestral-2405",
prompt="def is_odd(n): \n return n % 2 == 1 \ndef test_is_odd():",
suffix="return True", # optional
temperature=0, # optional
top_p=1, # optional
stream=True,
seed=10, # optional
stop=["return"], # optional
)
async for chunk in response:
print(chunk)
```
#### Expected Response
```json
{
"id": "726025d3e2d645d09d475bb0d29e3640",
"object": "text_completion",
"created": 1718659669,
"choices": [
{
"text": "This",
"index": 0,
"logprobs": null,
"finish_reason": null
}
],
"model": "codestral-2405",
}
```
</TabItem>
</Tabs>
### Supported Models
All models listed here https://docs.mistral.ai/platform/endpoints are supported. We actively maintain the list of models, pricing, token window, etc. [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json).
| Model Name | Function Call |
|----------------|--------------------------------------------------------------|
| Codestral Latest | `completion(model="text-completion-codestral/codestral-latest", messages)` |
| Codestral 2405 | `completion(model="text-completion-codestral/codestral-2405", messages)`|
## Chat Completions
:::info
Official Mistral API Docs: https://docs.mistral.ai/api/#operation/createChatCompletion
:::
<Tabs>
<TabItem value="no-streaming" label="No Streaming">
#### Sample Usage
```python
import os
import litellm
os.environ['CODESTRAL_API_KEY']
response = await litellm.acompletion(
model="codestral/codestral-latest",
messages=[
{
"role": "user",
"content": "Hey, how's it going?",
}
],
temperature=0.0, # optional
top_p=1, # optional
max_tokens=10, # optional
safe_prompt=False, # optional
seed=12, # optional
)
```
#### Expected Response
```json
{
"id": "chatcmpl-123",
"object": "chat.completion",
"created": 1677652288,
"model": "codestral/codestral-latest",
"system_fingerprint": None,
"choices": [{
"index": 0,
"message": {
"role": "assistant",
"content": "\n\nHello there, how may I assist you today?",
},
"logprobs": null,
"finish_reason": "stop"
}],
"usage": {
"prompt_tokens": 9,
"completion_tokens": 12,
"total_tokens": 21
}
}
```
</TabItem>
<TabItem value="stream" label="Streaming">
#### Sample Usage - Streaming
```python
import os
import litellm
os.environ['CODESTRAL_API_KEY']
response = await litellm.acompletion(
model="codestral/codestral-latest",
messages=[
{
"role": "user",
"content": "Hey, how's it going?",
}
],
stream=True, # optional
temperature=0.0, # optional
top_p=1, # optional
max_tokens=10, # optional
safe_prompt=False, # optional
seed=12, # optional
)
async for chunk in response:
print(chunk)
```
#### Expected Response
```json
{
"id":"chatcmpl-123",
"object":"chat.completion.chunk",
"created":1694268190,
"model": "codestral/codestral-latest",
"system_fingerprint": None,
"choices":[
{
"index":0,
"delta":{"role":"assistant","content":"gm"},
"logprobs":null,
" finish_reason":null
}
]
}
```
</TabItem>
</Tabs>
### Supported Models
All models listed here https://docs.mistral.ai/platform/endpoints are supported. We actively maintain the list of models, pricing, token window, etc. [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json).
| Model Name | Function Call |
|----------------|--------------------------------------------------------------|
| Codestral Latest | `completion(model="codestral/codestral-latest", messages)` |
| Codestral 2405 | `completion(model="codestral/codestral-2405", messages)`|

View file

@ -1,6 +1,13 @@
# DeepInfra
https://deepinfra.com/
:::tip
**We support ALL DeepInfra models, just set `model=deepinfra/<any-model-on-deepinfra>` as a prefix when sending litellm requests**
:::
## API Key
```python
# env variable
@ -38,13 +45,11 @@ for chunk in response:
## Chat Models
| Model Name | Function Call |
|------------------|--------------------------------------|
| meta-llama/Meta-Llama-3-8B-Instruct | `completion(model="deepinfra/meta-llama/Meta-Llama-3-8B-Instruct", messages)` |
| meta-llama/Meta-Llama-3-70B-Instruct | `completion(model="deepinfra/meta-llama/Meta-Llama-3-70B-Instruct", messages)` |
| meta-llama/Llama-2-70b-chat-hf | `completion(model="deepinfra/meta-llama/Llama-2-70b-chat-hf", messages)` |
| meta-llama/Llama-2-7b-chat-hf | `completion(model="deepinfra/meta-llama/Llama-2-7b-chat-hf", messages)` |
| meta-llama/Llama-2-13b-chat-hf | `completion(model="deepinfra/meta-llama/Llama-2-13b-chat-hf", messages)` |
| codellama/CodeLlama-34b-Instruct-hf | `completion(model="deepinfra/codellama/CodeLlama-34b-Instruct-hf", messages)` |
| mistralai/Mistral-7B-Instruct-v0.1 | `completion(model="deepinfra/mistralai/Mistral-7B-Instruct-v0.1", messages)` |
| jondurbin/airoboros-l2-70b-gpt4-1.4.1 | `completion(model="deepinfra/jondurbin/airoboros-l2-70b-gpt4-1.4.1", messages)` |

View file

@ -49,6 +49,6 @@ We support ALL Deepseek models, just set `deepseek/` as a prefix when sending co
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| deepseek-chat | `completion(model="deepseek/deepseek-chat", messages)` |
| deepseek-coder | `completion(model="deepseek/deepseek-chat", messages)` |
| deepseek-coder | `completion(model="deepseek/deepseek-coder", messages)` |

View file

@ -45,6 +45,52 @@ response = completion(
)
```
## Tool Calling
```python
from litellm import completion
import os
# set env
os.environ["GEMINI_API_KEY"] = ".."
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
response = completion(
model="gemini/gemini-1.5-flash",
messages=messages,
tools=tools,
)
# Add any assertions here to check the response args
print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance(
response.choices[0].message.tool_calls[0].function.arguments, str
)
```
# Gemini-Pro-Vision
LiteLLM Supports the following image types passed in `url`
- Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg
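For example, a minimal sketch (prompt text is illustrative) of passing a direct image link using the OpenAI-style content format:
```python
import os
import litellm

os.environ["GEMINI_API_KEY"] = ".."

response = litellm.completion(
    model="gemini/gemini-pro-vision",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What landmark is in this picture?"},
                {
                    "type": "image_url",
                    "image_url": "https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg",
                },
            ],
        }
    ],
)
print(response.choices[0].message.content)
```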

View file

@ -1,7 +1,11 @@
# Groq
https://groq.com/
**We support ALL Groq models, just set `groq/` as a prefix when sending completion requests**
:::tip
**We support ALL Groq models, just set `model=groq/<any-model-on-groq>` as a prefix when sending litellm requests**
:::
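For example, a minimal sketch of the prefix convention (the model name shown is one example of a Groq-hosted model):
```python
import os
from litellm import completion

os.environ["GROQ_API_KEY"] = ".."

# any Groq-hosted model works - just prefix its name with groq/
response = completion(
    model="groq/llama3-8b-8192",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response.choices[0].message.content)
```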
## API Key
```python

View file

@ -223,6 +223,17 @@ response = completion(
```
## OpenAI Fine Tuned Models
| Model Name | Function Call |
|---------------------------|-----------------------------------------------------------------|
| fine tuned `gpt-4-0613` | `response = completion(model="ft:gpt-4-0613", messages=messages)` |
| fine tuned `gpt-4o-2024-05-13` | `response = completion(model="ft:gpt-4o-2024-05-13", messages=messages)` |
| fine tuned `gpt-3.5-turbo-0125` | `response = completion(model="ft:gpt-3.5-turbo-0125", messages=messages)` |
| fine tuned `gpt-3.5-turbo-1106` | `response = completion(model="ft:gpt-3.5-turbo-1106", messages=messages)` |
| fine tuned `gpt-3.5-turbo-0613` | `response = completion(model="ft:gpt-3.5-turbo-0613", messages=messages)` |
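A minimal sketch - the fine-tuned model ID below is a placeholder; use the full `ft:...` ID returned by your fine-tuning job:
```python
import os
from litellm import completion

os.environ["OPENAI_API_KEY"] = "your-api-key"

# placeholder - replace with the ID from your fine-tuning job
fine_tuned_model = "ft:gpt-3.5-turbo-0125:my-org::abc123"

response = completion(
    model=fine_tuned_model,
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(response.choices[0].message.content)
```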
## Advanced
### Parallel Function calling

View file

@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# OpenAI (Text Completion)
LiteLLM supports OpenAI text completion models

View file

@ -208,7 +208,7 @@ print(response)
Instead of using the `custom_llm_provider` arg to specify which provider you're using (e.g. together ai), you can just pass the provider name as part of the model name, and LiteLLM will parse it out.
Expected format: <custom_llm_provider>/<model_name>
Expected format: `<custom_llm_provider>/<model_name>`
e.g. completion(model="together_ai/togethercomputer/Llama-2-7B-32K-Instruct", ...)

View file

@ -8,6 +8,152 @@ import TabItem from '@theme/TabItem';
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
## 🆕 `vertex_ai_beta/` route
New `vertex_ai_beta/` route. Adds support for system messages, tool_choice params, etc., by moving to the httpx client (instead of the Vertex SDK).
```python
from litellm import completion
import json
## GET CREDENTIALS
file_path = 'path/to/vertex_ai_service_account.json'
# Load the JSON file
with open(file_path, 'r') as file:
vertex_credentials = json.load(file)
# Convert to JSON string
vertex_credentials_json = json.dumps(vertex_credentials)
## COMPLETION CALL
response = completion(
model="vertex_ai_beta/gemini-pro",
messages=[{ "content": "Hello, how are you?","role": "user"}],
vertex_credentials=vertex_credentials_json
)
```
### **System Message**
```python
from litellm import completion
import json
## GET CREDENTIALS
file_path = 'path/to/vertex_ai_service_account.json'
# Load the JSON file
with open(file_path, 'r') as file:
vertex_credentials = json.load(file)
# Convert to JSON string
vertex_credentials_json = json.dumps(vertex_credentials)
response = completion(
model="vertex_ai_beta/gemini-pro",
messages=[{"content": "You are a good bot.","role": "system"}, {"content": "Hello, how are you?","role": "user"}],
vertex_credentials=vertex_credentials_json
)
```
### **Function Calling**
Force Gemini to make tool calls with `tool_choice="required"`.
```python
from litellm import completion
import json
## GET CREDENTIALS
file_path = 'path/to/vertex_ai_service_account.json'
# Load the JSON file
with open(file_path, 'r') as file:
vertex_credentials = json.load(file)
# Convert to JSON string
vertex_credentials_json = json.dumps(vertex_credentials)
messages = [
{
"role": "system",
"content": "Your name is Litellm Bot, you are a helpful assistant",
},
# User asks for their name and weather in San Francisco
{
"role": "user",
"content": "Hello, what is your name and can you tell me the weather?",
},
]
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
}
},
"required": ["location"],
},
},
}
]
data = {
"model": "vertex_ai_beta/gemini-1.5-pro-preview-0514"),
"messages": messages,
"tools": tools,
"tool_choice": "required",
"vertex_credentials": vertex_credentials_json
}
## COMPLETION CALL
print(completion(**data))
```
### **JSON Schema**
```python
from litellm import completion
import json
## GET CREDENTIALS
file_path = 'path/to/vertex_ai_service_account.json'
# Load the JSON file
with open(file_path, 'r') as file:
vertex_credentials = json.load(file)
# Convert to JSON string
vertex_credentials_json = json.dumps(vertex_credentials)
messages = [
{
"role": "user",
"content": """
List 5 popular cookie recipes.
Using this JSON schema:
Recipe = {"recipe_name": str}
Return a `list[Recipe]`
"""
}
]
completion(model="vertex_ai_beta/gemini-1.5-flash-preview-0514", messages=messages, response_format={ "type": "json_object" })
```
## Pre-requisites
* `pip install google-cloud-aiplatform` (pre-installed on proxy docker image)
* Authentication:
@ -140,7 +286,7 @@ In certain use-cases you may need to make calls to the models and pass [safety s
```python
response = completion(
model="gemini/gemini-pro",
model="vertex_ai/gemini-pro",
messages=[{"role": "user", "content": "write code for saying hi from LiteLLM"}]
safety_settings=[
{
@ -254,6 +400,7 @@ litellm.vertex_location = "us-central1 # Your Location
| Model Name | Function Call |
|------------------|--------------------------------------|
| claude-3-opus@20240229 | `completion('vertex_ai/claude-3-opus@20240229', messages)` |
| claude-3-5-sonnet@20240620 | `completion('vertex_ai/claude-3-5-sonnet@20240620', messages)` |
| claude-3-sonnet@20240229 | `completion('vertex_ai/claude-3-sonnet@20240229', messages)` |
| claude-3-haiku@20240307 | `completion('vertex_ai/claude-3-haiku@20240307', messages)` |
@ -363,8 +510,8 @@ response = completion(
## Gemini 1.5 Pro (and Vision)
| Model Name | Function Call |
|------------------|--------------------------------------|
| gemini-1.5-pro | `completion('gemini-1.5-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
| gemini-1.5-flash-preview-0514 | `completion('gemini-1.5-flash-preview-0514', messages)`, `completion('vertex_ai/gemini-pro', messages)` |
| gemini-1.5-pro | `completion('gemini-1.5-pro', messages)`, `completion('vertex_ai/gemini-1.5-pro', messages)` |
| gemini-1.5-flash-preview-0514 | `completion('gemini-1.5-flash-preview-0514', messages)`, `completion('vertex_ai/gemini-1.5-flash-preview-0514', messages)` |
| gemini-1.5-pro-preview-0514 | `completion('gemini-1.5-pro-preview-0514', messages)`, `completion('vertex_ai/gemini-1.5-pro-preview-0514', messages)` |
@ -680,6 +827,3 @@ s/o @[Darien Kindlund](https://www.linkedin.com/in/kindlund/) for this tutorial

View file

@ -1,3 +1,5 @@
import Image from '@theme/IdealImage';
# 🚨 Alerting / Webhooks
Get alerts for:
@ -15,6 +17,11 @@ Get alerts for:
- **Spend** Weekly & Monthly spend per Team, Tag
Works across:
- [Slack](#quick-start)
- [Discord](#advanced---using-discord-webhooks)
- [Microsoft Teams](#advanced---using-ms-teams-webhooks)
## Quick Start
Set up a slack alert channel to receive alerts from proxy.
@ -25,41 +32,33 @@ Get a slack webhook url from https://api.slack.com/messaging/webhooks
You can also use Discord Webhooks, see [here](#using-discord-webhooks)
### Step 2: Update config.yaml
- Set `SLACK_WEBHOOK_URL` in your proxy env to enable Slack alerts.
- Just for testing purposes, let's save a bad key to our proxy.
Set `SLACK_WEBHOOK_URL` in your proxy env to enable Slack alerts.
```bash
export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/<>/<>/<>"
```
### Step 2: Setup Proxy
```yaml
model_list:
model_name: "azure-model"
litellm_params:
model: "azure/gpt-35-turbo"
api_key: "my-bad-key" # 👈 bad key
general_settings:
alerting: ["slack"]
alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+
environment_variables:
SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/<>/<>/<>"
SLACK_DAILY_REPORT_FREQUENCY: "86400" # 24 hours; Optional: defaults to 12 hours
```
### Step 3: Start proxy
Start proxy
```bash
$ litellm --config /path/to/config.yaml
```
## Testing Alerting is Setup Correctly
Make a GET request to `/health/services`, expect to see a test slack alert in your provided webhook slack channel
### Step 3: Test it!
```shell
curl -X GET 'http://localhost:4000/health/services?service=slack' \
-H 'Authorization: Bearer sk-1234'
```bash
curl -X GET 'http://0.0.0.0:4000/health/services?service=slack' \
-H 'Authorization: Bearer sk-1234'
```
## Advanced - Redacting Messages from Alerts
@ -77,7 +76,34 @@ litellm_settings:
```
## Advanced - Add Metadata to alerts
Add alerting metadata to proxy calls for debugging.
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [],
extra_body={
"metadata": {
"alerting_metadata": {
"hello": "world"
}
}
}
)
```
**Expected Response**
<Image img={require('../../img/alerting_metadata.png')}/>
## Advanced - Opting into specific alert types
@ -108,6 +134,48 @@ AlertType = Literal[
```
## Advanced - Using MS Teams Webhooks
MS Teams provides a Slack-compatible webhook URL that you can use for alerting
##### Quick Start
1. [Get a webhook url](https://learn.microsoft.com/en-us/microsoftteams/platform/webhooks-and-connectors/how-to/add-incoming-webhook?tabs=newteams%2Cdotnet#create-an-incoming-webhook) for your Microsoft Teams channel
2. Add it to your .env
```bash
SLACK_WEBHOOK_URL="https://berriai.webhook.office.com/webhookb2/...6901/IncomingWebhook/b55fa0c2a48647be8e6effedcd540266/e04b1092-4a3e-44a2-ab6b-29a0a4854d1d"
```
3. Add it to your litellm config
```yaml
model_list:
model_name: "azure-model"
litellm_params:
model: "azure/gpt-35-turbo"
api_key: "my-bad-key" # 👈 bad key
general_settings:
alerting: ["slack"]
alerting_threshold: 300 # sends alerts if requests hang for 5min+ and responses take 5min+
```
4. Run health check!
Call the proxy `/health/services` endpoint to test if your alerting connection is correctly setup.
```bash
curl --location 'http://0.0.0.0:4000/health/services?service=slack' \
--header 'Authorization: Bearer sk-1234'
```
**Expected Response**
<Image img={require('../../img/ms_teams_alerting.png')}/>
## Advanced - Using Discord Webhooks
Discord provides a Slack-compatible webhook URL that you can use for alerting
@ -139,7 +207,6 @@ environment_variables:
SLACK_WEBHOOK_URL: "https://discord.com/api/webhooks/1240030362193760286/cTLWt5ATn1gKmcy_982rl5xmYHsrM1IWJdmCL1AyOmU9JdQXazrp8L1_PYgUtgxj8x4f/slack"
```
That's it! You're ready to go!
## Advanced - [BETA] Webhooks for Budget Alerts

View file

@ -252,6 +252,31 @@ $ litellm --config /path/to/config.yaml
```
## Multiple OpenAI Organizations
Add all openai models across all OpenAI organizations with just 1 model definition
```yaml
- model_name: "*"
litellm_params:
model: openai/*
api_key: os.environ/OPENAI_API_KEY
organization:
- org-1
- org-2
- org-3
```
LiteLLM will automatically create separate deployments for each org.
Confirm this via
```bash
curl --location 'http://0.0.0.0:4000/v1/model/info' \
--header 'Authorization: Bearer ${LITELLM_KEY}' \
--data ''
```
## Load Balancing
:::info

View file

@ -27,7 +27,7 @@ docker-compose up
<Tabs>
<TabItem value="basic" label="Basic">
<TabItem value="basic" label="Basic (No DB)">
### Step 1. CREATE config.yaml
@ -98,7 +98,13 @@ docker run ghcr.io/berriai/litellm:main-latest --port 8002 --num_workers 8
```
</TabItem>
<TabItem value="terraform" label="Terraform">
s/o [Nicholas Cecere](https://www.linkedin.com/in/nicholas-cecere-24243549/) for his LiteLLM User Management Terraform
👉 [Go here for Terraform](https://github.com/ncecere/terraform-litellm-user-mgmt)
</TabItem>
<TabItem value="base-image" label="use litellm as a base image">
```shell
@ -380,6 +386,7 @@ kubectl port-forward service/litellm-service 4000:4000
Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
</TabItem>
<TabItem value="helm-deploy" label="Helm">
@ -425,7 +432,6 @@ If you need to set your litellm proxy config.yaml, you can find this in [values.
</TabItem>
<TabItem value="helm-oci" label="Helm OCI Registry (GHCR)">
:::info
@ -669,7 +675,7 @@ Once the stack is created, get the DatabaseURL of the Database resource, copy th
#### 3. Connect to the EC2 Instance and deploy litellm on the EC2 container
From the EC2 console, connect to the instance created by the stack (e.g., using SSH).
Run the following command, replacing <database_url> with the value you copied in step 2
Run the following command, replacing `<database_url>` with the value you copied in step 2
```shell
docker run --name litellm-proxy \

View file

@ -5,6 +5,7 @@ import Image from '@theme/IdealImage';
Send an Email to your users when:
- A Proxy API Key is created for them
- Their API Key crosses its Budget
- All Team members of a LiteLLM Team -> when the team crosses its budget
<Image img={require('../../img/email_notifs.png')} style={{ width: '500px' }}/>

View file

@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Model Management
Add new models + Get model info without restarting proxy.

View file

@ -1,3 +1,5 @@
import Image from '@theme/IdealImage';
# LiteLLM Proxy Performance
### Throughput - 30% Increase

View file

@ -1,4 +1,4 @@
# Grafana, Prometheus metrics [BETA]
# 📈 Prometheus metrics [BETA]
LiteLLM exposes a `/metrics` endpoint for Prometheus to poll
@ -54,6 +54,13 @@ http://localhost:4000/metrics
| `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
| `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` |
### Budget Metrics
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_remaining_team_budget_metric` | Remaining Budget for Team (A team created on LiteLLM) |
| `litellm_remaining_api_key_budget_metric` | Remaining Budget for API Key (A key Created on LiteLLM)|
## Monitor System Health
To monitor the health of litellm adjacent services (redis / postgres), do:

View file

@ -409,6 +409,28 @@ print(response)
</Tabs>
### Content Policy Fallbacks
Fallback across providers (e.g. from Azure OpenAI to Anthropic) if you hit content policy violation errors.
```yaml
model_list:
- model_name: gpt-3.5-turbo-small
litellm_params:
model: azure/chatgpt-v-2
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
- model_name: claude-opus
litellm_params:
model: claude-3-opus-20240229
api_key: os.environ/ANTHROPIC_API_KEY
litellm_settings:
content_policy_fallbacks: [{"gpt-3.5-turbo-small": ["claude-opus"]}]
```
### EU-Region Filtering (Pre-Call Checks)
**Before call is made** check if a call is within model context window with **`enable_pre_call_checks: true`**.

View file

@ -123,4 +123,18 @@ LiteLLM Enterprise: Enable [SSO login](./ui.md#setup-ssoauth-for-ui)
4. User can now create their own keys
<Image img={require('../../img/ui_self_serve_create_key.png')} style={{ width: '800px', height: 'auto' }} />
## Advanced
### Setting custom logout URLs
Set `PROXY_LOGOUT_URL` in your .env if you want users to get redirected to a specific URL when they click logout
```
export PROXY_LOGOUT_URL="https://www.google.com"
```
<Image img={require('../../img/ui_logout.png')} style={{ width: '400px', height: 'auto' }} />

View file

@ -0,0 +1,154 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 💰 Setting Team Budgets
Track spend, set budgets for your Internal Team
## Setting Monthly Team Budgets
### 1. Create a team
- Set `max_budget=0.000000001` (the $ value the team is allowed to spend)
- Set `budget_duration="1d"` (How frequently the budget should update)
<Tabs>
<TabItem value="API" label="API">
Create a new team and set `max_budget` and `budget_duration`
```shell
curl -X POST 'http://0.0.0.0:4000/team/new' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"team_alias": "QA Prod Bot",
"max_budget": 0.000000001,
"budget_duration": "1d"
}'
```
Response
```shell
{
"team_alias": "QA Prod Bot",
"team_id": "de35b29e-6ca8-4f47-b804-2b79d07aa99a",
"max_budget": 0.0001,
"budget_duration": "1d",
"budget_reset_at": "2024-06-14T22:48:36.594000Z"
}
```
</TabItem>
<TabItem value="UI" label="Admin UI">
<Image img={require('../../img/create_team_gif_good.gif')} />
</TabItem>
</Tabs>
Possible values for `budget_duration`
| `budget_duration` | When Budget will reset |
| --- | --- |
| `budget_duration="1s"` | every 1 second |
| `budget_duration="1m"` | every 1 min |
| `budget_duration="1h"` | every 1 hour |
| `budget_duration="1d"` | every 1 day |
| `budget_duration="1mo"` | every 1 month |
### 2. Create a key for the `team`
Create a key for Team=`QA Prod Bot` and `team_id="de35b29e-6ca8-4f47-b804-2b79d07aa99a"` from Step 1
<Tabs>
<TabItem value="api" label="API">
💡 **The budget for Team="QA Prod Bot" will apply to this key**
```shell
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"team_id": "de35b29e-6ca8-4f47-b804-2b79d07aa99a"}'
```
Response
```shell
{"team_id":"de35b29e-6ca8-4f47-b804-2b79d07aa99a", "key":"sk-5qtncoYjzRcxMM4bDRktNQ"}
```
</TabItem>
<TabItem value="UI" label="Admin UI">
<Image img={require('../../img/create_key_in_team.gif')} />
</TabItem>
</Tabs>
### 3. Test It
Use the key from step 2 and run this Request twice
<Tabs>
<TabItem value="api" label="API">
```shell
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Authorization: Bearer sk-mso-JSykEGri86KyOvgxBw' \
-H 'Content-Type: application/json' \
-d ' {
"model": "llama3",
"messages": [
{
"role": "user",
"content": "hi"
}
]
}'
```
On the 2nd response - expect to see the following exception
```shell
{
"error": {
"message": "Budget has been exceeded! Current cost: 3.5e-06, Max budget: 1e-09",
"type": "auth_error",
"param": null,
"code": 400
}
}
```
</TabItem>
<TabItem value="UI" label="Admin UI">
<Image img={require('../../img/test_key_budget.gif')} />
</TabItem>
</Tabs>
## Advanced
### Prometheus metrics for `remaining_budget`
[More info about Prometheus metrics here](https://docs.litellm.ai/docs/proxy/prometheus)
You'll need the following in your proxy config.yaml
```yaml
litellm_settings:
success_callback: ["prometheus"]
failure_callback: ["prometheus"]
```
Expect to see this metric on prometheus to track the Remaining Budget for the team
```shell
litellm_remaining_team_budget_metric{team_alias="QA Prod Bot",team_id="de35b29e-6ca8-4f47-b804-2b79d07aa99a"} 9.699999999999992e-06
```

View file

@ -62,6 +62,14 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
You can:
- Add budgets to Teams
:::info
**Step-by step tutorial on setting, resetting budgets on Teams here (API or using Admin UI)**
👉 [https://docs.litellm.ai/docs/proxy/team_budgets](https://docs.litellm.ai/docs/proxy/team_budgets)
:::
#### **Add budgets to teams**
```shell
@ -413,6 +421,63 @@ curl 'http://0.0.0.0:4000/key/generate' \
</TabItem>
</Tabs>
### Reset Budgets
Reset budgets across keys/internal users/teams/customers
`budget_duration`: Budget is reset at the end of the specified duration. If not set, the budget is never reset. You can set the duration in seconds ("30s"), minutes ("30m"), hours ("30h"), or days ("30d").
<Tabs>
<TabItem value="users" label="Internal Users">
```bash
curl 'http://0.0.0.0:4000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"max_budget": 10,
"budget_duration": 10s, # 👈 KEY CHANGE
}'
```
</TabItem>
<TabItem value="keys" label="Keys">
```bash
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"max_budget": 10,
"budget_duration": 10s, # 👈 KEY CHANGE
}'
```
</TabItem>
<TabItem value="teams" label="Teams">
```bash
curl 'http://0.0.0.0:4000/team/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"max_budget": 10,
"budget_duration": 10s, # 👈 KEY CHANGE
}'
```
</TabItem>
</Tabs>
**Note:** By default, the server checks for resets every 10 minutes, to minimize DB calls.
To change this, set `proxy_budget_rescheduler_min_time` and `proxy_budget_rescheduler_max_time`
E.g.: check every 1 second
```yaml
general_settings:
proxy_budget_rescheduler_min_time: 1
proxy_budget_rescheduler_max_time: 1
```
## Set Rate Limits
You can set:

View file

@ -95,7 +95,7 @@ print(response)
- `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
- `router.aimage_generation()` - async image generation calls
## Advanced - Routing Strategies
## Advanced - Routing Strategies ⭐️
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based, Cost Based
Router provides 4 strategies for routing your calls across multiple deployments:
@ -262,7 +262,7 @@ if response is not None:
)
```
### Set Time Window
#### Set Time Window
Set time window for how far back to consider when averaging latency for a deployment.
@ -278,7 +278,7 @@ router_settings:
routing_strategy_args: {"ttl": 10}
```
### Set Lowest Latency Buffer
#### Set Lowest Latency Buffer
Set a buffer within which deployments are candidates for making calls to.
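A minimal Router sketch combining the time window and buffer settings (assuming a single `OPENAI_API_KEY` deployment; the `ttl` and `lowest_latency_buffer` values are illustrative):
```python
import os
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {
                "model": "gpt-3.5-turbo",
                "api_key": os.getenv("OPENAI_API_KEY"),
            },
        },
    ],
    routing_strategy="latency-based-routing",
    routing_strategy_args={
        "ttl": 10,  # only average latency over the last 10s
        "lowest_latency_buffer": 0.5,  # deployments within 50% of the lowest latency are candidates
    },
)
```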
@ -468,6 +468,122 @@ asyncio.run(router_acompletion())
```
</TabItem>
<TabItem value="custom" label="Custom Routing Strategy">
**Plugin a custom routing strategy to select deployments**
Step 1. Define your custom routing strategy
```python
from litellm.router import CustomRoutingStrategyBase
class CustomRoutingStrategy(CustomRoutingStrategyBase):
async def async_get_available_deployment(
self,
model: str,
messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None,
specific_deployment: Optional[bool] = False,
request_kwargs: Optional[Dict] = None,
):
"""
Asynchronously retrieves the available deployment based on the given parameters.
Args:
model (str): The name of the model.
messages (Optional[List[Dict[str, str]]], optional): The list of messages for a given request. Defaults to None.
input (Optional[Union[str, List]], optional): The input for a given embedding request. Defaults to None.
specific_deployment (Optional[bool], optional): Whether to retrieve a specific deployment. Defaults to False.
request_kwargs (Optional[Dict], optional): Additional request keyword arguments. Defaults to None.
Returns:
Returns an element from litellm.router.model_list
"""
print("In CUSTOM async get available deployment")
model_list = router.model_list
print("router model list=", model_list)
for model in model_list:
if isinstance(model, dict):
if model["litellm_params"]["model"] == "openai/very-special-endpoint":
return model
pass
def get_available_deployment(
self,
model: str,
messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None,
specific_deployment: Optional[bool] = False,
request_kwargs: Optional[Dict] = None,
):
"""
Synchronously retrieves the available deployment based on the given parameters.
Args:
model (str): The name of the model.
messages (Optional[List[Dict[str, str]]], optional): The list of messages for a given request. Defaults to None.
input (Optional[Union[str, List]], optional): The input for a given embedding request. Defaults to None.
specific_deployment (Optional[bool], optional): Whether to retrieve a specific deployment. Defaults to False.
request_kwargs (Optional[Dict], optional): Additional request keyword arguments. Defaults to None.
Returns:
Returns an element from litellm.router.model_list
"""
pass
```
Step 2. Initialize Router with custom routing strategy
```python
from litellm import Router
router = Router(
model_list=[
{
"model_name": "azure-model",
"litellm_params": {
"model": "openai/very-special-endpoint",
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/", # If you are Krrish, this is OpenAI Endpoint3 on our Railway endpoint :)
"api_key": "fake-key",
},
"model_info": {"id": "very-special-endpoint"},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "openai/fast-endpoint",
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
"api_key": "fake-key",
},
"model_info": {"id": "fast-endpoint"},
},
],
set_verbose=True,
debug_level="DEBUG",
timeout=1,
) # type: ignore
router.set_custom_routing_strategy(CustomRoutingStrategy()) # 👈 Set your routing strategy here
```
Step 3. Test your routing strategy. Expect your custom routing strategy to be called when running `router.acompletion` requests
```python
for _ in range(10):
response = await router.acompletion(
model="azure-model", messages=[{"role": "user", "content": "hello"}]
)
print(response)
_picked_model_id = response._hidden_params["model_id"]
print("picked model=", _picked_model_id)
```
</TabItem>
<TabItem value="lowest-cost" label="Lowest Cost Routing (Async)">
Picks a deployment based on the lowest cost
@ -563,7 +679,6 @@ asyncio.run(router_acompletion())
```
</TabItem>
</Tabs>
## Basic Reliability
@ -790,85 +905,205 @@ If the error is a context window exceeded error, fall back to a larger model gro
Fallbacks are done in order - `["gpt-3.5-turbo", "gpt-4", "gpt-4-32k"]` will do 'gpt-3.5-turbo' first, then 'gpt-4', etc.
You can also set 'default_fallbacks', in case a specific model group is misconfigured / bad.
You can also set `default_fallbacks`, in case a specific model group is misconfigured / bad.
There are 3 types of fallbacks:
- `content_policy_fallbacks`: For litellm.ContentPolicyViolationError - LiteLLM maps content policy violation errors across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8495C27-L8495C54)
- `context_window_fallbacks`: For litellm.ContextWindowExceededErrors - LiteLLM maps context window error messages across providers [**See Code**](https://github.com/BerriAI/litellm/blob/89a43c872a1e3084519fb9de159bf52f5447c6c4/litellm/utils.py#L8469)
- `fallbacks`: For all remaining errors - e.g. litellm.RateLimitError
**Content Policy Violation Fallback**
Key change:
```python
from litellm import Router
model_list = [
{ # list of model deployments
"model_name": "azure/gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"api_key": "bad-key",
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
},
"tpm": 240000,
"rpm": 1800
},
{ # list of model deployments
"model_name": "azure/gpt-3.5-turbo-context-fallback", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"api_key": "bad-key",
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
},
"tpm": 240000,
"rpm": 1800
},
{
"model_name": "azure/gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": "bad-key",
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE")
},
"tpm": 240000,
"rpm": 1800
},
{
"model_name": "gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo",
"api_key": os.getenv("OPENAI_API_KEY"),
},
"tpm": 1000000,
"rpm": 9000
},
{
"model_name": "gpt-3.5-turbo-16k", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo-16k",
"api_key": os.getenv("OPENAI_API_KEY"),
},
"tpm": 1000000,
"rpm": 9000
}
]
router = Router(model_list=model_list,
fallbacks=[{"azure/gpt-3.5-turbo": ["gpt-3.5-turbo"]}],
default_fallbacks=["gpt-3.5-turbo-16k"],
context_window_fallbacks=[{"azure/gpt-3.5-turbo-context-fallback": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}],
set_verbose=True)
user_message = "Hello, whats the weather in San Francisco??"
messages = [{"content": user_message, "role": "user"}]
# normal fallback call
response = router.completion(model="azure/gpt-3.5-turbo", messages=messages)
# context window fallback call
response = router.completion(model="azure/gpt-3.5-turbo-context-fallback", messages=messages)
print(f"response: {response}")
content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}]
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import Router
router = Router(
model_list=[
{
"model_name": "claude-2",
"litellm_params": {
"model": "claude-2",
"api_key": "",
"mock_response": Exception("content filtering policy"),
},
},
{
"model_name": "my-fallback-model",
"litellm_params": {
"model": "claude-2",
"api_key": "",
"mock_response": "This works!",
},
},
],
content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
# fallbacks=[..], # [OPTIONAL]
# context_window_fallbacks=[..], # [OPTIONAL]
)
response = router.completion(
model="claude-2",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
In your proxy config.yaml just add this line 👇
```yaml
router_settings:
  content_policy_fallbacks: [{"claude-2": ["my-fallback-model"]}]
```
Start proxy
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
</TabItem>
</Tabs>
**Context Window Exceeded Fallback**
Key change:
```python
context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}]
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import Router
router = Router(
model_list=[
{
"model_name": "claude-2",
"litellm_params": {
"model": "claude-2",
"api_key": "",
"mock_response": Exception("prompt is too long"),
},
},
{
"model_name": "my-fallback-model",
"litellm_params": {
"model": "claude-2",
"api_key": "",
"mock_response": "This works!",
},
},
],
context_window_fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
# fallbacks=[..], # [OPTIONAL]
# content_policy_fallbacks=[..], # [OPTIONAL]
)
response = router.completion(
model="claude-2",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
In your proxy config.yaml just add this line 👇
```yaml
router_settings:
  context_window_fallbacks: [{"claude-2": ["my-fallback-model"]}]
```
Start proxy
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
</TabItem>
</Tabs>
**Regular Fallbacks**
Key change:
```python
fallbacks=[{"claude-2": ["my-fallback-model"]}]
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import Router
router = Router(
model_list=[
{
"model_name": "claude-2",
"litellm_params": {
"model": "claude-2",
"api_key": "",
"mock_response": Exception("this is a rate limit error"),
},
},
{
"model_name": "my-fallback-model",
"litellm_params": {
"model": "claude-2",
"api_key": "",
"mock_response": "This works!",
},
},
],
fallbacks=[{"claude-2": ["my-fallback-model"]}], # 👈 KEY CHANGE
# context_window_fallbacks=[..], # [OPTIONAL]
# content_policy_fallbacks=[..], # [OPTIONAL]
)
response = router.completion(
model="claude-2",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
In your proxy config.yaml just add this line 👇
```yaml
router_settings:
  fallbacks: [{"claude-2": ["my-fallback-model"]}]
```
Start proxy
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
</TabItem>
</Tabs>
### Caching
In production, we recommend using a Redis cache. For quickly testing things locally, we also support simple in-memory caching.
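A minimal sketch of both setups (assuming the Router accepts `redis_host` / `redis_port` / `redis_password` for the Redis connection and `cache_responses=True` to enable response caching):
```python
import os
from litellm import Router

model_list = [
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {
            "model": "gpt-3.5-turbo",
            "api_key": os.getenv("OPENAI_API_KEY"),
        },
    }
]

# Production: Redis cache, shared across router instances / workers
router = Router(
    model_list=model_list,
    redis_host=os.getenv("REDIS_HOST"),
    redis_port=os.getenv("REDIS_PORT"),
    redis_password=os.getenv("REDIS_PASSWORD"),
    cache_responses=True,
)

# Local testing: in-memory cache, scoped to the current process
router = Router(model_list=model_list, cache_responses=True)
```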

View file

@ -23,9 +23,13 @@ https://api.together.xyz/playground/chat?model=togethercomputer%2Fllama-2-70b-ch
model_name = "together_ai/togethercomputer/llama-2-70b-chat"
response = completion(model=model_name, messages=messages)
print(response)
```
```
{'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'role': 'assistant', 'content': "\n\nI'm not able to provide real-time weather information. However, I can suggest"}}], 'created': 1691629657.9288375, 'model': 'togethercomputer/llama-2-70b-chat', 'usage': {'prompt_tokens': 9, 'completion_tokens': 17, 'total_tokens': 26}}
```
LiteLLM handles the prompt formatting for Together AI's Llama2 models as well, converting your message to the

View file

@ -38,9 +38,6 @@ const config = {
disableInDev: false,
},
],
[ require.resolve('docusaurus-lunr-search'), {
languages: ['en'] // language codes
}],
() => ({
name: 'cripchat',
injectHtmlTags() {
@ -90,6 +87,15 @@ const config = {
({
// Replace with your project's social card
image: 'img/docusaurus-social-card.png',
algolia: {
// The application ID provided by Algolia
appId: 'NU85Y4NU0B',
// Public API key: it is safe to commit it
apiKey: '4e0cf8c3020d0c876ad9174cea5c01fb',
indexName: 'litellm',
},
navbar: {
title: '🚅 LiteLLM',
items: [
@ -138,8 +144,8 @@ const config = {
title: 'Docs',
items: [
{
label: 'Tutorial',
to: '/docs/index',
label: 'Getting Started',
to: 'https://docs.litellm.ai/docs/',
},
],
},

Binary image files not shown (6 images added): 207 KiB, 3.2 MiB, 2.7 MiB, 241 KiB, 1.2 MiB, 27 KiB.

File diff suppressed because it is too large.

View file

@ -23,8 +23,8 @@
"docusaurus": "^1.14.7",
"docusaurus-lunr-search": "^2.4.1",
"prism-react-renderer": "^1.3.5",
"react": "^17.0.2",
"react-dom": "^17.0.2",
"react": "^18.1.0",
"react-dom": "^18.1.0",
"sharp": "^0.32.6",
"uuid": "^9.0.1"
},

View file

@ -43,6 +43,7 @@ const sidebars = {
"proxy/cost_tracking",
"proxy/self_serve",
"proxy/users",
"proxy/team_budgets",
"proxy/customers",
"proxy/billing",
"proxy/user_keys",
@ -54,6 +55,7 @@ const sidebars = {
items: ["proxy/logging", "proxy/streaming_logging"],
},
"proxy/ui",
"proxy/prometheus",
"proxy/email",
"proxy/multiple_admins",
"proxy/team_based_routing",
@ -70,7 +72,6 @@ const sidebars = {
"proxy/pii_masking",
"proxy/prompt_injection",
"proxy/caching",
"proxy/prometheus",
"proxy/call_hooks",
"proxy/rules",
"proxy/cli",
@ -87,6 +88,7 @@ const sidebars = {
},
items: [
"completion/input",
"completion/drop_params",
"completion/prompt_formatting",
"completion/output",
"exception_mapping",
@ -133,10 +135,11 @@ const sidebars = {
"providers/vertex",
"providers/palm",
"providers/gemini",
"providers/mistral",
"providers/anthropic",
"providers/aws_sagemaker",
"providers/bedrock",
"providers/mistral",
"providers/codestral",
"providers/cohere",
"providers/anyscale",
"providers/huggingface",
@ -170,10 +173,8 @@ const sidebars = {
"proxy/custom_pricing",
"routing",
"scheduler",
"rules",
"set_keys",
"budget_manager",
"contributing",
"secret",
"completion/token_usage",
"load_test",
@ -181,11 +182,11 @@ const sidebars = {
type: "category",
label: "Logging & Observability",
items: [
"observability/langfuse_integration",
"observability/logfire_integration",
"debugging/local_debugging",
"observability/raw_request_response",
"observability/callbacks",
"observability/custom_callback",
"observability/langfuse_integration",
"observability/sentry",
"observability/lago",
"observability/openmeter",
@ -223,14 +224,16 @@ const sidebars = {
},
{
type: "category",
label: "LangChain, LlamaIndex Integration",
items: ["langchain/langchain"],
label: "LangChain, LlamaIndex, Instructor Integration",
items: ["langchain/langchain", "tutorials/instructor"],
},
{
type: "category",
label: "Extras",
items: [
"extras/contributing",
"contributing",
"rules",
"proxy_server",
{
type: "category",

File diff suppressed because it is too large.

View file

@ -93,7 +93,7 @@ class _ENTERPRISE_BannedKeywords(CustomLogger):
response.choices[0], litellm.utils.Choices
):
for word in self.banned_keywords_list:
self.test_violation(test_str=response.choices[0].message.content)
self.test_violation(test_str=response.choices[0].message.content or "")
async def async_post_call_streaming_hook(
self,

View file

@ -122,236 +122,6 @@ async def ui_get_spend_by_tags(
return {"spend_per_tag": ui_tags}
async def view_spend_logs_from_clickhouse(
api_key=None, user_id=None, request_id=None, start_date=None, end_date=None
):
verbose_logger.debug("Reading logs from Clickhouse")
import os
# if user has setup clickhouse
# TODO: Move this to be a helper function
# querying clickhouse for this data
import clickhouse_connect
from datetime import datetime
port = os.getenv("CLICKHOUSE_PORT")
if port is not None and isinstance(port, str):
port = int(port)
client = clickhouse_connect.get_client(
host=os.getenv("CLICKHOUSE_HOST"),
port=port,
username=os.getenv("CLICKHOUSE_USERNAME", ""),
password=os.getenv("CLICKHOUSE_PASSWORD", ""),
)
if (
start_date is not None
and isinstance(start_date, str)
and end_date is not None
and isinstance(end_date, str)
):
# Convert the date strings to datetime objects
start_date_obj = datetime.strptime(start_date, "%Y-%m-%d")
end_date_obj = datetime.strptime(end_date, "%Y-%m-%d")
# get top spend per day
response = client.query(
f"""
SELECT
toDate(startTime) AS day,
sum(spend) AS total_spend
FROM
spend_logs
WHERE
toDate(startTime) BETWEEN toDate('2024-02-01') AND toDate('2024-02-29')
GROUP BY
day
ORDER BY
total_spend
"""
)
results = []
result_rows = list(response.result_rows)
for response in result_rows:
current_row = {}
current_row["users"] = {"example": 0.0}
current_row["models"] = {}
current_row["spend"] = float(response[1])
current_row["startTime"] = str(response[0])
# stubbed api_key
current_row[""] = 0.0 # type: ignore
results.append(current_row)
return results
else:
# check if spend logs exist, if it does then return last 10 logs, sorted in descending order of startTime
response = client.query(
"""
SELECT
*
FROM
default.spend_logs
ORDER BY
startTime DESC
LIMIT
10
"""
)
# get size of spend logs
num_rows = client.query("SELECT count(*) FROM default.spend_logs")
num_rows = num_rows.result_rows[0][0]
# safely access num_rows.result_rows[0][0]
if num_rows is None:
num_rows = 0
raw_rows = list(response.result_rows)
response_data = {
"logs": raw_rows,
"log_count": num_rows,
}
return response_data
def _create_clickhouse_material_views(client=None, table_names=[]):
# Create Materialized Views if they don't exist
# Materialized Views send new inserted rows to the aggregate tables
verbose_logger.debug("Clickhouse: Creating Materialized Views")
if "daily_aggregated_spend_per_model_mv" not in table_names:
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_model_mv")
client.command(
"""
CREATE MATERIALIZED VIEW daily_aggregated_spend_per_model_mv
TO daily_aggregated_spend_per_model
AS
SELECT
toDate(startTime) as day,
sumState(spend) AS DailySpend,
model as model
FROM spend_logs
GROUP BY
day, model
"""
)
if "daily_aggregated_spend_per_api_key_mv" not in table_names:
verbose_logger.debug(
"Clickhouse: Creating daily_aggregated_spend_per_api_key_mv"
)
client.command(
"""
CREATE MATERIALIZED VIEW daily_aggregated_spend_per_api_key_mv
TO daily_aggregated_spend_per_api_key
AS
SELECT
toDate(startTime) as day,
sumState(spend) AS DailySpend,
api_key as api_key
FROM spend_logs
GROUP BY
day, api_key
"""
)
if "daily_aggregated_spend_per_user_mv" not in table_names:
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_user_mv")
client.command(
"""
CREATE MATERIALIZED VIEW daily_aggregated_spend_per_user_mv
TO daily_aggregated_spend_per_user
AS
SELECT
toDate(startTime) as day,
sumState(spend) AS DailySpend,
user as user
FROM spend_logs
GROUP BY
day, user
"""
)
if "daily_aggregated_spend_mv" not in table_names:
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_mv")
client.command(
"""
CREATE MATERIALIZED VIEW daily_aggregated_spend_mv
TO daily_aggregated_spend
AS
SELECT
toDate(startTime) as day,
sumState(spend) AS DailySpend
FROM spend_logs
GROUP BY
day
"""
)
def _create_clickhouse_aggregate_tables(client=None, table_names=[]):
# Basic Logging works without this - this is only used for low latency reporting apis
verbose_logger.debug("Clickhouse: Creating Aggregate Tables")
# Create Aggregeate Tables if they don't exist
if "daily_aggregated_spend_per_model" not in table_names:
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_model")
client.command(
"""
CREATE TABLE daily_aggregated_spend_per_model
(
`day` Date,
`DailySpend` AggregateFunction(sum, Float64),
`model` String
)
ENGINE = SummingMergeTree()
ORDER BY (day, model);
"""
)
if "daily_aggregated_spend_per_api_key" not in table_names:
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_api_key")
client.command(
"""
CREATE TABLE daily_aggregated_spend_per_api_key
(
`day` Date,
`DailySpend` AggregateFunction(sum, Float64),
`api_key` String
)
ENGINE = SummingMergeTree()
ORDER BY (day, api_key);
"""
)
if "daily_aggregated_spend_per_user" not in table_names:
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend_per_user")
client.command(
"""
CREATE TABLE daily_aggregated_spend_per_user
(
`day` Date,
`DailySpend` AggregateFunction(sum, Float64),
`user` String
)
ENGINE = SummingMergeTree()
ORDER BY (day, user);
"""
)
if "daily_aggregated_spend" not in table_names:
verbose_logger.debug("Clickhouse: Creating daily_aggregated_spend")
client.command(
"""
CREATE TABLE daily_aggregated_spend
(
`day` Date,
`DailySpend` AggregateFunction(sum, Float64),
)
ENGINE = SummingMergeTree()
ORDER BY (day);
"""
)
return
def _forecast_daily_cost(data: list):
import requests # type: ignore
from datetime import datetime, timedelta

View file

@ -13,7 +13,10 @@ from litellm._logging import (
verbose_logger,
json_logs,
_turn_on_json,
log_level,
)
from litellm.proxy._types import (
KeyManagementSystem,
KeyManagementSettings,
@ -34,7 +37,7 @@ input_callback: List[Union[str, Callable]] = []
success_callback: List[Union[str, Callable]] = []
failure_callback: List[Union[str, Callable]] = []
service_callback: List[Union[str, Callable]] = []
_custom_logger_compatible_callbacks_literal = Literal["lago", "openmeter"]
_custom_logger_compatible_callbacks_literal = Literal["lago", "openmeter", "logfire"]
callbacks: List[Union[Callable, _custom_logger_compatible_callbacks_literal]] = []
_langfuse_default_tags: Optional[
List[
@ -73,7 +76,7 @@ token: Optional[str] = (
)
telemetry = True
max_tokens = 256 # OpenAI Defaults
drop_params = False
drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
modify_params = False
retry = True
### AUTH ###
@ -240,6 +243,7 @@ num_retries: Optional[int] = None # per model endpoint
default_fallbacks: Optional[List] = None
fallbacks: Optional[List] = None
context_window_fallbacks: Optional[List] = None
content_policy_fallbacks: Optional[List] = None
allowed_fails: int = 0
num_retries_per_request: Optional[int] = (
None # for the request overall (incl. fallbacks + model retries)
@ -337,6 +341,7 @@ bedrock_models: List = []
deepinfra_models: List = []
perplexity_models: List = []
watsonx_models: List = []
gemini_models: List = []
for key, value in model_cost.items():
if value.get("litellm_provider") == "openai":
open_ai_chat_completion_models.append(key)
@ -383,13 +388,16 @@ for key, value in model_cost.items():
perplexity_models.append(key)
elif value.get("litellm_provider") == "watsonx":
watsonx_models.append(key)
elif value.get("litellm_provider") == "gemini":
gemini_models.append(key)
# known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary
openai_compatible_endpoints: List = [
"api.perplexity.ai",
"api.endpoints.anyscale.com/v1",
"api.deepinfra.com/v1/openai",
"api.mistral.ai/v1",
"codestral.mistral.ai/v1/chat/completions",
"codestral.mistral.ai/v1/fim/completions",
"api.groq.com/openai/v1",
"api.deepseek.com/v1",
"api.together.xyz/v1",
@ -401,6 +409,7 @@ openai_compatible_providers: List = [
"anyscale",
"mistral",
"groq",
"codestral",
"deepseek",
"deepinfra",
"perplexity",
@ -592,6 +601,7 @@ model_list = (
+ maritalk_models
+ vertex_language_models
+ watsonx_models
+ gemini_models
)
provider_list: List = [
@ -607,6 +617,7 @@ provider_list: List = [
"together_ai",
"openrouter",
"vertex_ai",
"vertex_ai_beta",
"palm",
"gemini",
"ai21",
@ -627,6 +638,8 @@ provider_list: List = [
"anyscale",
"mistral",
"groq",
"codestral",
"text-completion-codestral",
"deepseek",
"maritalk",
"voyage",
@ -664,6 +677,7 @@ models_by_provider: dict = {
"perplexity": perplexity_models,
"maritalk": maritalk_models,
"watsonx": watsonx_models,
"gemini": gemini_models,
}
# mapping for those models which have larger equivalents
@ -716,6 +730,7 @@ openai_image_generation_models = ["dall-e-2", "dall-e-3"]
from .timeout import timeout
from .cost_calculator import completion_cost
from litellm.litellm_core_utils.litellm_logging import Logging
from .utils import (
client,
exception_type,
@ -724,12 +739,11 @@ from .utils import (
token_counter,
create_pretrained_tokenizer,
create_tokenizer,
cost_per_token,
supports_function_calling,
supports_parallel_function_calling,
supports_vision,
supports_system_messages,
get_litellm_params,
Logging,
acreate,
get_model_list,
get_max_tokens,
@ -749,9 +763,10 @@ from .utils import (
get_first_chars_messages,
ModelResponse,
ImageResponse,
ImageObject,
get_provider_fields,
)
from .types.utils import ImageObject
from .llms.huggingface_restapi import HuggingfaceConfig
from .llms.anthropic import AnthropicConfig
from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig
@ -768,6 +783,7 @@ from .llms.gemini import GeminiConfig
from .llms.nlp_cloud import NLPCloudConfig
from .llms.aleph_alpha import AlephAlphaConfig
from .llms.petals import PetalsConfig
from .llms.vertex_httpx import VertexGeminiConfig
from .llms.vertex_ai import VertexAIConfig, VertexAITextEmbeddingConfig
from .llms.vertex_ai_anthropic import VertexAIAnthropicConfig
from .llms.sagemaker import SagemakerConfig
@ -792,7 +808,9 @@ from .llms.openai import (
MistralConfig,
MistralEmbeddingConfig,
DeepInfraConfig,
AzureAIStudioConfig,
)
from .llms.text_completion_codestral import MistralTextCompletionConfig
from .llms.azure import (
AzureOpenAIConfig,
AzureOpenAIError,
@ -826,4 +844,4 @@ from .router import Router
from .assistants.main import *
from .batches.main import *
from .scheduler import *
from .cost_calculator import response_cost_calculator
from .cost_calculator import response_cost_calculator, cost_per_token

View file

@ -1,21 +1,40 @@
import logging, os, json
from logging import Formatter
import json
import logging
import os
import traceback
from datetime import datetime
from logging import Formatter
set_verbose = False
if set_verbose is True:
logging.warning(
"`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."
)
json_logs = bool(os.getenv("JSON_LOGS", False))
# Create a handler for the logger (you may need to adapt this based on your needs)
log_level = os.getenv("LITELLM_LOG", "DEBUG")
numeric_level: str = getattr(logging, log_level.upper())
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
handler.setLevel(numeric_level)
class JsonFormatter(Formatter):
def __init__(self):
super(JsonFormatter, self).__init__()
def formatTime(self, record, datefmt=None):
# Use datetime to format the timestamp in ISO 8601 format
dt = datetime.fromtimestamp(record.created)
return dt.isoformat()
def format(self, record):
json_record = {}
json_record["message"] = record.getMessage()
json_record = {
"message": record.getMessage(),
"level": record.levelname,
"timestamp": self.formatTime(record),
}
return json.dumps(json_record)

View file

@ -1192,7 +1192,7 @@ class S3Cache(BaseCache):
return cached_response
except botocore.exceptions.ClientError as e:
if e.response["Error"]["Code"] == "NoSuchKey":
verbose_logger.error(
verbose_logger.debug(
f"S3 Cache: The specified key '{key}' does not exist in the S3 bucket."
)
return None

View file

@ -1,21 +1,292 @@
# What is this?
## File for 'response_cost' calculation in Logging
from typing import Optional, Union, Literal, List
import time
from typing import List, Literal, Optional, Tuple, Union
import litellm
import litellm._logging
from litellm import verbose_logger
from litellm.litellm_core_utils.llm_cost_calc.google import (
cost_per_character as google_cost_per_character,
)
from litellm.litellm_core_utils.llm_cost_calc.google import (
cost_per_token as google_cost_per_token,
)
from litellm.utils import (
ModelResponse,
CallTypes,
CostPerToken,
EmbeddingResponse,
ImageResponse,
TranscriptionResponse,
ModelResponse,
TextCompletionResponse,
CallTypes,
cost_per_token,
TranscriptionResponse,
print_verbose,
CostPerToken,
token_counter,
)
import litellm
from litellm import verbose_logger
def _cost_per_token_custom_pricing_helper(
prompt_tokens: float = 0,
completion_tokens: float = 0,
response_time_ms=None,
### CUSTOM PRICING ###
custom_cost_per_token: Optional[CostPerToken] = None,
custom_cost_per_second: Optional[float] = None,
) -> Optional[Tuple[float, float]]:
"""Internal helper function for calculating cost, if custom pricing given"""
if custom_cost_per_token is None and custom_cost_per_second is None:
return None
if custom_cost_per_token is not None:
input_cost = custom_cost_per_token["input_cost_per_token"] * prompt_tokens
output_cost = custom_cost_per_token["output_cost_per_token"] * completion_tokens
return input_cost, output_cost
elif custom_cost_per_second is not None:
output_cost = custom_cost_per_second * response_time_ms / 1000 # type: ignore
return 0, output_cost
return None
def cost_per_token(
model: str = "",
prompt_tokens: float = 0,
completion_tokens: float = 0,
response_time_ms=None,
custom_llm_provider: Optional[str] = None,
region_name=None,
### CHARACTER PRICING ###
prompt_characters: float = 0,
completion_characters: float = 0,
### CUSTOM PRICING ###
custom_cost_per_token: Optional[CostPerToken] = None,
custom_cost_per_second: Optional[float] = None,
) -> Tuple[float, float]:
"""
Calculates the cost per token for a given model, prompt tokens, and completion tokens.
Parameters:
model (str): The name of the model to use. Default is ""
prompt_tokens (int): The number of tokens in the prompt.
completion_tokens (int): The number of tokens in the completion.
response_time (float): The amount of time, in milliseconds, it took the call to complete.
prompt_characters (float): The number of characters in the prompt. Used for vertex ai cost calculation.
completion_characters (float): The number of characters in the completion response. Used for vertex ai cost calculation.
custom_llm_provider (str): The llm provider to whom the call was made (see init.py for full list)
custom_cost_per_token: Optional[CostPerToken]: the cost per input + output token for the llm api call.
custom_cost_per_second: Optional[float]: the cost per second for the llm api call.
Returns:
tuple: A tuple containing the cost in USD dollars for prompt tokens and completion tokens, respectively.
"""
args = locals()
if model is None:
raise Exception("Invalid arg. Model cannot be none.")
## CUSTOM PRICING ##
response_cost = _cost_per_token_custom_pricing_helper(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
response_time_ms=response_time_ms,
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
)
if response_cost is not None:
return response_cost[0], response_cost[1]
# given
prompt_tokens_cost_usd_dollar: float = 0
completion_tokens_cost_usd_dollar: float = 0
model_cost_ref = litellm.model_cost
model_with_provider = model
if custom_llm_provider is not None:
model_with_provider = custom_llm_provider + "/" + model
if region_name is not None:
model_with_provider_and_region = (
f"{custom_llm_provider}/{region_name}/{model}"
)
if (
model_with_provider_and_region in model_cost_ref
): # use region based pricing, if it's available
model_with_provider = model_with_provider_and_region
else:
_, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
model_without_prefix = model
model_parts = model.split("/")
if len(model_parts) > 1:
model_without_prefix = model_parts[1]
else:
model_without_prefix = model
"""
Code block that formats model to lookup in litellm.model_cost
Option1. model = "bedrock/ap-northeast-1/anthropic.claude-instant-v1". This is the most accurate since it is region based. Should always be option 1
Option2. model = "openai/gpt-4" - model = provider/model
Option3. model = "anthropic.claude-3" - model = model
"""
if (
model_with_provider in model_cost_ref
): # Option 2. use model with provider, model = "openai/gpt-4"
model = model_with_provider
elif model in model_cost_ref: # Option 1. use model passed, model="gpt-4"
model = model
elif (
model_without_prefix in model_cost_ref
): # Option 3. if user passed model="bedrock/anthropic.claude-3", use model="anthropic.claude-3"
model = model_without_prefix
# see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
print_verbose(f"Looking up model={model} in model_cost_map")
if custom_llm_provider == "vertex_ai":
return google_cost_per_character(
model=model_without_prefix,
custom_llm_provider=custom_llm_provider,
prompt_characters=prompt_characters,
completion_characters=completion_characters,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
elif custom_llm_provider == "gemini":
return google_cost_per_token(
model=model_without_prefix,
custom_llm_provider=custom_llm_provider,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
elif model in model_cost_ref:
print_verbose(f"Success: model={model} in model_cost_map")
print_verbose(
f"prompt_tokens={prompt_tokens}; completion_tokens={completion_tokens}"
)
if (
model_cost_ref[model].get("input_cost_per_token", None) is not None
and model_cost_ref[model].get("output_cost_per_token", None) is not None
):
## COST PER TOKEN ##
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
)
elif (
model_cost_ref[model].get("output_cost_per_second", None) is not None
and response_time_ms is not None
):
print_verbose(
f"For model={model} - output_cost_per_second: {model_cost_ref[model].get('output_cost_per_second')}; response time: {response_time_ms}"
)
## COST PER SECOND ##
prompt_tokens_cost_usd_dollar = 0
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_second"]
* response_time_ms
/ 1000
)
elif (
model_cost_ref[model].get("input_cost_per_second", None) is not None
and response_time_ms is not None
):
print_verbose(
f"For model={model} - input_cost_per_second: {model_cost_ref[model].get('input_cost_per_second')}; response time: {response_time_ms}"
)
## COST PER SECOND ##
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_second"] * response_time_ms / 1000
)
completion_tokens_cost_usd_dollar = 0.0
print_verbose(
f"Returned custom cost for model={model} - prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}, completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:gpt-3.5-turbo" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:gpt-3.5-turbo:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-3.5-turbo"]["output_cost_per_token"]
* completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:gpt-4-0613" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:gpt-4-0613:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-4-0613"]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-4-0613"]["output_cost_per_token"] * completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:gpt-4o-2024-05-13" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:gpt-4o-2024-05-13:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-4o-2024-05-13"]["input_cost_per_token"]
* prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-4o-2024-05-13"]["output_cost_per_token"]
* completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:davinci-002" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:davinci-002:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:davinci-002"]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref["ft:davinci-002"]["output_cost_per_token"]
* completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:babbage-002" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:babbage-002:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:babbage-002"]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref["ft:babbage-002"]["output_cost_per_token"]
* completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif model in litellm.azure_llms:
verbose_logger.debug(f"Cost Tracking: {model} is an Azure LLM")
model = litellm.azure_llms[model]
verbose_logger.debug(
f"applying cost={model_cost_ref[model]['input_cost_per_token']} for prompt_tokens={prompt_tokens}"
)
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
)
verbose_logger.debug(
f"applying cost={model_cost_ref[model]['output_cost_per_token']} for completion_tokens={completion_tokens}"
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif model in litellm.azure_embedding_models:
verbose_logger.debug(f"Cost Tracking: {model} is an Azure Embedding Model")
model = litellm.azure_embedding_models[model]
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
else:
# if model is not in model_prices_and_context_window.json. Raise an exception-let users know
error_str = f"Model not in model_prices_and_context_window.json. You passed model={model}. Register pricing for model - https://docs.litellm.ai/docs/proxy/custom_pricing\n"
raise litellm.exceptions.NotFoundError( # type: ignore
message=error_str,
model=model,
llm_provider="",
)
# Extract the number of billion parameters from the model name
@ -147,7 +418,9 @@ def completion_cost(
model = "dall-e-2" # for dall-e-2, azure expects an empty model name
# Handle Inputs to completion_cost
prompt_tokens = 0
prompt_characters = 0
completion_tokens = 0
completion_characters = 0
custom_llm_provider = None
if completion_response is not None:
# get input/output tokens from completion_response
@ -264,6 +537,30 @@ def completion_cost(
f"Model is None and does not exist in passed completion_response. Passed completion_response={completion_response}, model={model}"
)
if (
custom_llm_provider is not None
and custom_llm_provider == "vertex_ai"
and completion_response is not None
and isinstance(completion_response, ModelResponse)
):
# Calculate the prompt characters + response characters
if len("messages") > 0:
prompt_string = litellm.utils.get_formatted_prompt(
data={"messages": messages}, call_type="completion"
)
else:
prompt_string = ""
prompt_characters = litellm.utils._count_characters(text=prompt_string)
completion_string = litellm.utils.get_response_string(
response_obj=completion_response
)
completion_characters = litellm.utils._count_characters(
text=completion_string
)
(
prompt_tokens_cost_usd_dollar,
completion_tokens_cost_usd_dollar,
@ -276,6 +573,8 @@ def completion_cost(
region_name=region_name,
custom_cost_per_second=custom_cost_per_second,
custom_cost_per_token=custom_cost_per_token,
prompt_characters=prompt_characters,
completion_characters=completion_characters,
)
_final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
print_verbose(

View file

@ -26,7 +26,7 @@ class AuthenticationError(openai.AuthenticationError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = 401
self.message = message
self.message = "litellm.AuthenticationError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
@ -72,7 +72,7 @@ class NotFoundError(openai.NotFoundError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = 404
self.message = message
self.message = "litellm.NotFoundError: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
@ -117,7 +117,7 @@ class BadRequestError(openai.BadRequestError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = 400
self.message = message
self.message = "litellm.BadRequestError: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
@ -162,7 +162,7 @@ class UnprocessableEntityError(openai.UnprocessableEntityError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = 422
self.message = message
self.message = "litellm.UnprocessableEntityError: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
@ -204,7 +204,7 @@ class Timeout(openai.APITimeoutError): # type: ignore
request=request
) # Call the base class constructor with the parameters it needs
self.status_code = 408
self.message = message
self.message = "litellm.Timeout: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
@ -241,7 +241,7 @@ class PermissionDeniedError(openai.PermissionDeniedError): # type:ignore
num_retries: Optional[int] = None,
):
self.status_code = 403
self.message = message
self.message = "litellm.PermissionDeniedError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
@ -280,7 +280,7 @@ class RateLimitError(openai.RateLimitError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = 429
self.message = message
self.message = "litellm.RateLimitError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
@ -324,19 +324,21 @@ class ContextWindowExceededError(BadRequestError): # type: ignore
message,
model,
llm_provider,
response: httpx.Response,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
):
self.status_code = 400
self.message = message
self.message = "litellm.ContextWindowExceededError: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
self.response = response or httpx.Response(status_code=400, request=request)
super().__init__(
message=self.message,
model=self.model, # type: ignore
llm_provider=self.llm_provider, # type: ignore
response=response,
response=self.response,
litellm_debug_info=self.litellm_debug_info,
) # Call the base class constructor with the parameters it needs
@ -368,7 +370,7 @@ class RejectedRequestError(BadRequestError): # type: ignore
litellm_debug_info: Optional[str] = None,
):
self.status_code = 400
self.message = message
self.message = "litellm.RejectedRequestError: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
@ -407,19 +409,21 @@ class ContentPolicyViolationError(BadRequestError): # type: ignore
message,
model,
llm_provider,
response: httpx.Response,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
):
self.status_code = 400
self.message = message
self.message = "litellm.ContentPolicyViolationError: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
request = httpx.Request(method="POST", url="https://api.openai.com/v1")
self.response = response or httpx.Response(status_code=500, request=request)
super().__init__(
message=self.message,
model=self.model, # type: ignore
llm_provider=self.llm_provider, # type: ignore
response=response,
response=self.response,
litellm_debug_info=self.litellm_debug_info,
) # Call the base class constructor with the parameters it needs
@ -452,7 +456,7 @@ class ServiceUnavailableError(openai.APIStatusError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = 503
self.message = message
self.message = "litellm.ServiceUnavailableError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
@ -501,7 +505,7 @@ class InternalServerError(openai.InternalServerError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = 500
self.message = message
self.message = "litellm.InternalServerError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
@ -552,7 +556,7 @@ class APIError(openai.APIError): # type: ignore
num_retries: Optional[int] = None,
):
self.status_code = status_code
self.message = message
self.message = "litellm.APIError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
self.litellm_debug_info = litellm_debug_info
@ -589,7 +593,7 @@ class APIConnectionError(openai.APIConnectionError): # type: ignore
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
):
self.message = message
self.message = "litellm.APIConnectionError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
self.status_code = 500
@ -626,7 +630,7 @@ class APIResponseValidationError(openai.APIResponseValidationError): # type: ig
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
):
self.message = message
self.message = "litellm.APIResponseValidationError: {}".format(message)
self.llm_provider = llm_provider
self.model = model
request = httpx.Request(method="POST", url="https://api.openai.com/v1")

View file

@ -226,14 +226,6 @@ def _start_clickhouse():
response = client.query("DESCRIBE default.spend_logs")
verbose_logger.debug(f"spend logs schema ={response.result_rows}")
# RUN Enterprise Clickhouse Setup
# TLDR: For Enterprise - we create views / aggregate tables for low latency reporting APIs
from litellm.proxy.enterprise.utils import _create_clickhouse_aggregate_tables
from litellm.proxy.enterprise.utils import _create_clickhouse_material_views
_create_clickhouse_aggregate_tables(client=client, table_names=table_names)
_create_clickhouse_material_views(client=client, table_names=table_names)
class ClickhouseLogger:
# Class variables or attributes

View file

@ -10,7 +10,7 @@ import traceback
class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callback#callback-class
# Class variables or attributes
def __init__(self):
def __init__(self) -> None:
pass
def log_pre_api_call(self, model, messages, kwargs):

View file

@ -0,0 +1,136 @@
"""
Functions for sending Email Alerts
"""
import os
from typing import Optional, List
from litellm.proxy._types import WebhookEvent
import asyncio
from litellm._logging import verbose_logger, verbose_proxy_logger
# we use this for the email header, please send a test email if you change this. verify it looks good on email
LITELLM_LOGO_URL = "https://litellm-listing.s3.amazonaws.com/litellm_logo.png"
LITELLM_SUPPORT_CONTACT = "support@berri.ai"
async def get_all_team_member_emails(team_id: Optional[str] = None) -> list:
verbose_logger.debug(
"Email Alerting: Getting all team members for team_id=%s", team_id
)
if team_id is None:
return []
from litellm.proxy.proxy_server import premium_user, prisma_client
if prisma_client is None:
raise Exception("Not connected to DB!")
team_row = await prisma_client.db.litellm_teamtable.find_unique(
where={
"team_id": team_id,
}
)
if team_row is None:
return []
_team_members = team_row.members_with_roles
verbose_logger.debug(
"Email Alerting: Got team members for team_id=%s Team Members: %s",
team_id,
_team_members,
)
_team_member_user_ids: List[str] = []
for member in _team_members:
if member and isinstance(member, dict) and member.get("user_id") is not None:
_team_member_user_ids.append(member.get("user_id"))
sql_query = """
SELECT user_email
FROM "LiteLLM_UserTable"
WHERE user_id = ANY($1::TEXT[]);
"""
_result = await prisma_client.db.query_raw(sql_query, _team_member_user_ids)
verbose_logger.debug("Email Alerting: Got all Emails for team, emails=%s", _result)
if _result is None:
return []
emails = []
for user in _result:
if user and isinstance(user, dict) and user.get("user_email", None) is not None:
emails.append(user.get("user_email"))
return emails
async def send_team_budget_alert(webhook_event: WebhookEvent) -> bool:
"""
Send an Email Alert to All Team Members when the Team Budget is crossed
Returns -> True if sent, False if not.
"""
from litellm.proxy.utils import send_email
from litellm.proxy.proxy_server import premium_user, prisma_client
_team_id = webhook_event.team_id
team_alias = webhook_event.team_alias
verbose_logger.debug(
"Email Alerting: Sending Team Budget Alert for team=%s", team_alias
)
email_logo_url = os.getenv("SMTP_SENDER_LOGO", os.getenv("EMAIL_LOGO_URL", None))
email_support_contact = os.getenv("EMAIL_SUPPORT_CONTACT", None)
# await self._check_if_using_premium_email_feature(
# premium_user, email_logo_url, email_support_contact
# )
if email_logo_url is None:
email_logo_url = LITELLM_LOGO_URL
if email_support_contact is None:
email_support_contact = LITELLM_SUPPORT_CONTACT
recipient_emails = await get_all_team_member_emails(_team_id)
recipient_emails_str: str = ",".join(recipient_emails)
verbose_logger.debug(
"Email Alerting: Sending team budget alert to %s", recipient_emails_str
)
event_name = webhook_event.event_message
max_budget = webhook_event.max_budget
email_html_content = "Alert from LiteLLM Server"
if recipient_emails_str is None:
verbose_proxy_logger.error(
"Email Alerting: Trying to send email alert to no recipient, got recipient_emails=%s",
recipient_emails_str,
)
email_html_content = f"""
<img src="{email_logo_url}" alt="LiteLLM Logo" width="150" height="50" /> <br/><br/><br/>
Budget Crossed for Team <b> {team_alias} </b> <br/> <br/>
Your team's LLM API usage has crossed its <b> budget of ${max_budget} </b>, current spend is <b>${webhook_event.spend}</b><br /> <br />
API requests will be rejected until either (a) you increase your budget or (b) your budget gets reset <br /> <br />
If you have any questions, please send an email to {email_support_contact} <br /> <br />
Best, <br />
The LiteLLM team <br />
"""
email_event = {
"to": recipient_emails_str,
"subject": f"LiteLLM {event_name} for Team {team_alias}",
"html": email_html_content,
}
await send_email(
receiver_email=email_event["to"],
subject=email_event["subject"],
html=email_event["html"],
)
return False

View file

@ -1,13 +1,19 @@
# What is this?
## On Success events log cost to Lago - https://github.com/BerriAI/litellm/issues/3639
import dotenv, os, json
import json
import os
import traceback
import uuid
from typing import Literal, Optional
import dotenv
import httpx
import litellm
import traceback, httpx
from litellm import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
import uuid
from typing import Optional, Literal
def get_utc_datetime():
@ -143,6 +149,7 @@ class LagoLogger(CustomLogger):
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
try:
verbose_logger.debug("ENTERS LAGO CALLBACK")
_url = os.getenv("LAGO_API_BASE")
assert _url is not None and isinstance(
_url, str

View file

@ -1,21 +1,27 @@
#### What this does ####
# On success, logs events to Langfuse
import os
import copy
import os
import traceback
from packaging.version import Version
from litellm._logging import verbose_logger
import litellm
from litellm._logging import verbose_logger
class LangFuseLogger:
# Class variables or attributes
def __init__(
self, langfuse_public_key=None, langfuse_secret=None, flush_interval=1
self,
langfuse_public_key=None,
langfuse_secret=None,
langfuse_host=None,
flush_interval=1,
):
try:
from langfuse import Langfuse
import langfuse
from langfuse import Langfuse
except Exception as e:
raise Exception(
f"\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error: {e}\n{traceback.format_exc()}\033[0m"
@ -23,7 +29,9 @@ class LangFuseLogger:
# Instance variables
self.secret_key = langfuse_secret or os.getenv("LANGFUSE_SECRET_KEY")
self.public_key = langfuse_public_key or os.getenv("LANGFUSE_PUBLIC_KEY")
self.langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
self.langfuse_host = langfuse_host or os.getenv(
"LANGFUSE_HOST", "https://cloud.langfuse.com"
)
self.langfuse_release = os.getenv("LANGFUSE_RELEASE")
self.langfuse_debug = os.getenv("LANGFUSE_DEBUG")
@ -167,7 +175,7 @@ class LangFuseLogger:
or isinstance(response_obj, litellm.EmbeddingResponse)
):
input = prompt
output = response_obj["data"]
output = None
elif response_obj is not None and isinstance(
response_obj, litellm.ModelResponse
):
@ -251,7 +259,7 @@ class LangFuseLogger:
input,
response_obj,
):
from langfuse.model import CreateTrace, CreateGeneration
from langfuse.model import CreateGeneration, CreateTrace
verbose_logger.warning(
"Please upgrade langfuse to v2.0.0 or higher: https://github.com/langfuse/langfuse-python/releases/tag/v2.0.1"
@ -528,31 +536,14 @@ class LangFuseLogger:
"version": clean_metadata.pop("version", None),
}
parent_observation_id = metadata.get("parent_observation_id", None)
if parent_observation_id is not None:
generation_params["parent_observation_id"] = parent_observation_id
if supports_prompt:
user_prompt = clean_metadata.pop("prompt", None)
if user_prompt is None:
pass
elif isinstance(user_prompt, dict):
from langfuse.model import (
TextPromptClient,
ChatPromptClient,
Prompt_Text,
Prompt_Chat,
)
if user_prompt.get("type", "") == "chat":
_prompt_chat = Prompt_Chat(**user_prompt)
generation_params["prompt"] = ChatPromptClient(
prompt=_prompt_chat
)
elif user_prompt.get("type", "") == "text":
_prompt_text = Prompt_Text(**user_prompt)
generation_params["prompt"] = TextPromptClient(
prompt=_prompt_text
)
else:
generation_params["prompt"] = user_prompt
generation_params = _add_prompt_to_generation_params(
generation_params=generation_params, clean_metadata=clean_metadata
)
if output is not None and isinstance(output, str) and level == "ERROR":
generation_params["status_message"] = output
@ -565,5 +556,58 @@ class LangFuseLogger:
return generation_client.trace_id, generation_id
except Exception as e:
verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
verbose_logger.error(f"Langfuse Layer Error - {traceback.format_exc()}")
return None, None
def _add_prompt_to_generation_params(
generation_params: dict, clean_metadata: dict
) -> dict:
from langfuse.model import (
ChatPromptClient,
Prompt_Chat,
Prompt_Text,
TextPromptClient,
)
user_prompt = clean_metadata.pop("prompt", None)
if user_prompt is None:
pass
elif isinstance(user_prompt, dict):
if user_prompt.get("type", "") == "chat":
_prompt_chat = Prompt_Chat(**user_prompt)
generation_params["prompt"] = ChatPromptClient(prompt=_prompt_chat)
elif user_prompt.get("type", "") == "text":
_prompt_text = Prompt_Text(**user_prompt)
generation_params["prompt"] = TextPromptClient(prompt=_prompt_text)
elif "version" in user_prompt and "prompt" in user_prompt:
# prompts
if isinstance(user_prompt["prompt"], str):
_prompt_obj = Prompt_Text(
name=user_prompt["name"],
prompt=user_prompt["prompt"],
version=user_prompt["version"],
config=user_prompt.get("config", None),
)
generation_params["prompt"] = TextPromptClient(prompt=_prompt_obj)
elif isinstance(user_prompt["prompt"], list):
_prompt_obj = Prompt_Chat(
name=user_prompt["name"],
prompt=user_prompt["prompt"],
version=user_prompt["version"],
config=user_prompt.get("config", None),
)
generation_params["prompt"] = ChatPromptClient(prompt=_prompt_obj)
else:
verbose_logger.error(
"[Non-blocking] Langfuse Logger: Invalid prompt format"
)
else:
verbose_logger.error(
"[Non-blocking] Langfuse Logger: Invalid prompt format. No prompt logged to Langfuse"
)
else:
generation_params["prompt"] = user_prompt
return generation_params

View file

@ -105,7 +105,6 @@ class LunaryLogger:
end_time=datetime.now(timezone.utc),
error=None,
):
# Method definition
try:
print_verbose(f"Lunary Logging - Logging request for model {model}")
@ -114,10 +113,9 @@ class LunaryLogger:
metadata = litellm_params.get("metadata", {}) or {}
if optional_params:
# merge into extra
extra = {**extra, **optional_params}
tags = litellm_params.pop("tags", None) or []
tags = metadata.get("tags", None)
if extra:
extra.pop("extra_body", None)

View file

@ -1,20 +1,21 @@
import os
from dataclasses import dataclass
from datetime import datetime
import litellm
from litellm.integrations.custom_logger import CustomLogger
from litellm._logging import verbose_logger
from litellm.types.services import ServiceLoggerPayload
from functools import wraps
from typing import Union, Optional, TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, Optional, Union
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.types.services import ServiceLoggerPayload
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
from litellm.proxy.proxy_server import UserAPIKeyAuth as _UserAPIKeyAuth
from litellm.proxy._types import (
ManagementEndpointLoggingPayload as _ManagementEndpointLoggingPayload,
)
from litellm.proxy.proxy_server import UserAPIKeyAuth as _UserAPIKeyAuth
Span = _Span
UserAPIKeyAuth = _UserAPIKeyAuth
@ -107,8 +108,9 @@ class OpenTelemetry(CustomLogger):
start_time: Optional[datetime] = None,
end_time: Optional[datetime] = None,
):
from opentelemetry import trace
from datetime import datetime
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = start_time
@ -145,8 +147,9 @@ class OpenTelemetry(CustomLogger):
start_time: Optional[datetime] = None,
end_time: Optional[datetime] = None,
):
from opentelemetry import trace
from datetime import datetime
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = start_time
@ -179,8 +182,8 @@ class OpenTelemetry(CustomLogger):
async def async_post_call_failure_hook(
self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth
):
from opentelemetry.trace import Status, StatusCode
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
parent_otel_span = user_api_key_dict.parent_otel_span
if parent_otel_span is not None:
@ -202,8 +205,8 @@ class OpenTelemetry(CustomLogger):
parent_otel_span.end(end_time=self._to_ns(datetime.now()))
def _handle_sucess(self, kwargs, response_obj, start_time, end_time):
from opentelemetry.trace import Status, StatusCode
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
verbose_logger.debug(
"OpenTelemetry Logger: Logging kwargs: %s, OTEL config settings=%s",
@ -253,9 +256,10 @@ class OpenTelemetry(CustomLogger):
span.end(end_time=self._to_ns(end_time))
def set_tools_attributes(self, span: Span, tools):
from litellm.proxy._types import SpanAttributes
import json
from litellm.proxy._types import SpanAttributes
if not tools:
return
@ -320,7 +324,7 @@ class OpenTelemetry(CustomLogger):
)
span.set_attribute(
SpanAttributes.LLM_IS_STREAMING, optional_params.get("stream", False)
SpanAttributes.LLM_IS_STREAMING, str(optional_params.get("stream", False))
)
if optional_params.get("tools"):
@ -439,7 +443,7 @@ class OpenTelemetry(CustomLogger):
#############################################
########## LLM Response Attributes ##########
#############################################
if _raw_response:
if _raw_response and isinstance(_raw_response, str):
# cast sr -> dict
import json
@ -478,10 +482,10 @@ class OpenTelemetry(CustomLogger):
return _parent_context
def _get_span_context(self, kwargs):
from opentelemetry import trace
from opentelemetry.trace.propagation.tracecontext import (
TraceContextTextMapPropagator,
)
from opentelemetry import trace
litellm_params = kwargs.get("litellm_params", {}) or {}
proxy_server_request = litellm_params.get("proxy_server_request", {}) or {}
@ -505,17 +509,17 @@ class OpenTelemetry(CustomLogger):
return TraceContextTextMapPropagator().extract(carrier=carrier), None
def _get_span_processor(self):
from opentelemetry.sdk.trace.export import (
SpanExporter,
SimpleSpanProcessor,
BatchSpanProcessor,
ConsoleSpanExporter,
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
OTLPSpanExporter as OTLPSpanExporterGRPC,
)
from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
OTLPSpanExporter as OTLPSpanExporterHTTP,
)
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
OTLPSpanExporter as OTLPSpanExporterGRPC,
from opentelemetry.sdk.trace.export import (
BatchSpanProcessor,
ConsoleSpanExporter,
SimpleSpanProcessor,
SpanExporter,
)
verbose_logger.debug(
@ -574,8 +578,9 @@ class OpenTelemetry(CustomLogger):
logging_payload: ManagementEndpointLoggingPayload,
parent_otel_span: Optional[Span] = None,
):
from opentelemetry import trace
from datetime import datetime
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = logging_payload.start_time
@ -619,8 +624,9 @@ class OpenTelemetry(CustomLogger):
logging_payload: ManagementEndpointLoggingPayload,
parent_otel_span: Optional[Span] = None,
):
from opentelemetry import trace
from datetime import datetime
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = logging_payload.start_time

View file

@ -8,6 +8,7 @@ import traceback
import datetime, subprocess, sys
import litellm, uuid
from litellm._logging import print_verbose, verbose_logger
from typing import Optional, Union
class PrometheusLogger:
@ -17,33 +18,76 @@ class PrometheusLogger:
**kwargs,
):
try:
from prometheus_client import Counter
from prometheus_client import Counter, Gauge
self.litellm_llm_api_failed_requests_metric = Counter(
name="litellm_llm_api_failed_requests_metric",
documentation="Total number of failed LLM API calls via litellm",
labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
labelnames=[
"end_user",
"hashed_api_key",
"model",
"team",
"team_alias",
"user",
],
)
self.litellm_requests_metric = Counter(
name="litellm_requests_metric",
documentation="Total number of LLM calls to litellm",
labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
labelnames=[
"end_user",
"hashed_api_key",
"model",
"team",
"team_alias",
"user",
],
)
# Counter for spend
self.litellm_spend_metric = Counter(
"litellm_spend_metric",
"Total spend on LLM requests",
labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
labelnames=[
"end_user",
"hashed_api_key",
"model",
"team",
"team_alias",
"user",
],
)
# Counter for total_output_tokens
self.litellm_tokens_metric = Counter(
"litellm_total_tokens",
"Total number of input + output tokens from LLM requests",
labelnames=["end_user", "hashed_api_key", "model", "team", "user"],
labelnames=[
"end_user",
"hashed_api_key",
"model",
"team",
"team_alias",
"user",
],
)
# Remaining Budget for Team
self.litellm_remaining_team_budget_metric = Gauge(
"litellm_remaining_team_budget_metric",
"Remaining budget for team",
labelnames=["team_id", "team_alias"],
)
# Remaining Budget for API Key
self.litellm_remaining_api_key_budget_metric = Gauge(
"litellm_remaining_api_key_budget_metric",
"Remaining budget for api key",
labelnames=["hashed_api_key", "api_key_alias"],
)
except Exception as e:
print_verbose(f"Got exception on init prometheus client {str(e)}")
raise e
@ -51,7 +95,9 @@ class PrometheusLogger:
async def _async_log_event(
self, kwargs, response_obj, start_time, end_time, print_verbose, user_id
):
self.log_event(kwargs, response_obj, start_time, end_time, print_verbose)
self.log_event(
kwargs, response_obj, start_time, end_time, user_id, print_verbose
)
def log_event(
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
@ -72,9 +118,36 @@ class PrometheusLogger:
"user_api_key_user_id", None
)
user_api_key = litellm_params.get("metadata", {}).get("user_api_key", None)
user_api_key_alias = litellm_params.get("metadata", {}).get(
"user_api_key_alias", None
)
user_api_team = litellm_params.get("metadata", {}).get(
"user_api_key_team_id", None
)
user_api_team_alias = litellm_params.get("metadata", {}).get(
"user_api_key_team_alias", None
)
_team_spend = litellm_params.get("metadata", {}).get(
"user_api_key_team_spend", None
)
_team_max_budget = litellm_params.get("metadata", {}).get(
"user_api_key_team_max_budget", None
)
_remaining_team_budget = safe_get_remaining_budget(
max_budget=_team_max_budget, spend=_team_spend
)
_api_key_spend = litellm_params.get("metadata", {}).get(
"user_api_key_spend", None
)
_api_key_max_budget = litellm_params.get("metadata", {}).get(
"user_api_key_max_budget", None
)
_remaining_api_key_budget = safe_get_remaining_budget(
max_budget=_api_key_max_budget, spend=_api_key_spend
)
if response_obj is not None:
tokens_used = response_obj.get("usage", {}).get("total_tokens", 0)
else:
@ -94,19 +167,47 @@ class PrometheusLogger:
user_api_key = hash_token(user_api_key)
self.litellm_requests_metric.labels(
end_user_id, user_api_key, model, user_api_team, user_id
end_user_id,
user_api_key,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc()
self.litellm_spend_metric.labels(
end_user_id, user_api_key, model, user_api_team, user_id
end_user_id,
user_api_key,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc(response_cost)
self.litellm_tokens_metric.labels(
end_user_id, user_api_key, model, user_api_team, user_id
end_user_id,
user_api_key,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc(tokens_used)
self.litellm_remaining_team_budget_metric.labels(
user_api_team, user_api_team_alias
).set(_remaining_team_budget)
self.litellm_remaining_api_key_budget_metric.labels(
user_api_key, user_api_key_alias
).set(_remaining_api_key_budget)
### FAILURE INCREMENT ###
if "exception" in kwargs:
self.litellm_llm_api_failed_requests_metric.labels(
end_user_id, user_api_key, model, user_api_team, user_id
end_user_id,
user_api_key,
model,
user_api_team,
user_api_team_alias,
user_id,
).inc()
except Exception as e:
verbose_logger.error(
@ -114,3 +215,15 @@ class PrometheusLogger:
)
verbose_logger.debug(traceback.format_exc())
pass
def safe_get_remaining_budget(
max_budget: Optional[float], spend: Optional[float]
) -> float:
if max_budget is None:
return float("inf")
if spend is None:
return max_budget
return max_budget - spend
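
For illustration, a minimal standalone sketch (not the proxy's actual wiring) of how the helper above can feed the new team-budget gauge; the metric name and label names are taken from the diff, while the team id/alias and dollar amounts are made up:

from typing import Optional

from prometheus_client import Gauge


def safe_get_remaining_budget(
    max_budget: Optional[float], spend: Optional[float]
) -> float:
    # mirrors the helper above: no budget -> unlimited, no spend -> full budget
    if max_budget is None:
        return float("inf")
    if spend is None:
        return max_budget
    return max_budget - spend


litellm_remaining_team_budget_metric = Gauge(
    "litellm_remaining_team_budget_metric",
    "Remaining budget for team",
    labelnames=["team_id", "team_alias"],
)

# e.g. a team with a $100 max budget that has spent $42.50 so far (illustrative values)
remaining = safe_get_remaining_budget(max_budget=100.0, spend=42.50)
litellm_remaining_team_budget_metric.labels("team-123", "prod-team").set(remaining)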

View file

@ -330,6 +330,7 @@ class SlackAlerting(CustomLogger):
messages = "Message not logged. litellm.redact_messages_in_exceptions=True"
request_info = f"\nRequest Model: `{model}`\nAPI Base: `{api_base}`\nMessages: `{messages}`"
slow_message = f"`Responses are slow - {round(time_difference_float,2)}s response time > Alerting threshold: {self.alerting_threshold}s`"
alerting_metadata: dict = {}
if time_difference_float > self.alerting_threshold:
# add deployment latencies to alert
if (
@ -337,7 +338,7 @@ class SlackAlerting(CustomLogger):
and "litellm_params" in kwargs
and "metadata" in kwargs["litellm_params"]
):
_metadata = kwargs["litellm_params"]["metadata"]
_metadata: dict = kwargs["litellm_params"]["metadata"]
request_info = litellm.utils._add_key_name_and_team_to_alert(
request_info=request_info, metadata=_metadata
)
@ -349,10 +350,14 @@ class SlackAlerting(CustomLogger):
request_info += (
f"\nAvailable Deployment Latencies\n{_deployment_latency_map}"
)
if "alerting_metadata" in _metadata:
alerting_metadata = _metadata["alerting_metadata"]
await self.send_alert(
message=slow_message + request_info,
level="Low",
alert_type="llm_too_slow",
alerting_metadata=alerting_metadata,
)
async def async_update_daily_reports(
@ -540,7 +545,12 @@ class SlackAlerting(CustomLogger):
message += f"\n\nNext Run is at: `{time.time() + self.alerting_args.daily_report_frequency}`s"
# send alert
await self.send_alert(message=message, level="Low", alert_type="daily_reports")
await self.send_alert(
message=message,
level="Low",
alert_type="daily_reports",
alerting_metadata={},
)
return True
@ -582,6 +592,7 @@ class SlackAlerting(CustomLogger):
await asyncio.sleep(
self.alerting_threshold
) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
alerting_metadata: dict = {}
if (
request_data is not None
and request_data.get("litellm_status", "") != "success"
@ -606,7 +617,7 @@ class SlackAlerting(CustomLogger):
):
# In hanging requests, sometimes the request has not made it to the point where the deployment is passed to `request_data`
# in that case we fall back to the api base set in the request metadata
_metadata = request_data["metadata"]
_metadata: dict = request_data["metadata"]
_api_base = _metadata.get("api_base", "")
request_info = litellm.utils._add_key_name_and_team_to_alert(
@ -615,6 +626,9 @@ class SlackAlerting(CustomLogger):
if _api_base is None:
_api_base = ""
if "alerting_metadata" in _metadata:
alerting_metadata = _metadata["alerting_metadata"]
request_info += f"\nAPI Base: `{_api_base}`"
# only alert hanging responses if they have not been marked as success
alerting_message = (
@ -640,6 +654,7 @@ class SlackAlerting(CustomLogger):
message=alerting_message + request_info,
level="Medium",
alert_type="llm_requests_hanging",
alerting_metadata=alerting_metadata,
)
async def failed_tracking_alert(self, error_message: str):
@ -650,7 +665,10 @@ class SlackAlerting(CustomLogger):
result = await _cache.async_get_cache(key=_cache_key)
if result is None:
await self.send_alert(
message=message, level="High", alert_type="budget_alerts"
message=message,
level="High",
alert_type="budget_alerts",
alerting_metadata={},
)
await _cache.async_set_cache(
key=_cache_key,
@ -680,7 +698,7 @@ class SlackAlerting(CustomLogger):
return
if "budget_alerts" not in self.alert_types:
return
_id: str = "default_id" # used for caching
_id: Optional[str] = "default_id" # used for caching
user_info_json = user_info.model_dump(exclude_none=True)
for k, v in user_info_json.items():
user_info_str = "\n{}: {}\n".format(k, v)
@ -751,6 +769,7 @@ class SlackAlerting(CustomLogger):
level="High",
alert_type="budget_alerts",
user_info=webhook_event,
alerting_metadata={},
)
await _cache.async_set_cache(
key=_cache_key,
@ -769,7 +788,13 @@ class SlackAlerting(CustomLogger):
response_cost: Optional[float],
max_budget: Optional[float],
):
if end_user_id is not None and token is not None and response_cost is not None:
if (
self.alerting is not None
and "webhook" in self.alerting
and end_user_id is not None
and token is not None
and response_cost is not None
):
# log customer spend
event = WebhookEvent(
spend=response_cost,
@ -941,7 +966,10 @@ class SlackAlerting(CustomLogger):
)
# send minor alert
await self.send_alert(
message=msg, level="Medium", alert_type="outage_alerts"
message=msg,
level="Medium",
alert_type="outage_alerts",
alerting_metadata={},
)
# set to true
outage_value["minor_alert_sent"] = True
@ -963,7 +991,12 @@ class SlackAlerting(CustomLogger):
)
# send minor alert
await self.send_alert(message=msg, level="High", alert_type="outage_alerts")
await self.send_alert(
message=msg,
level="High",
alert_type="outage_alerts",
alerting_metadata={},
)
# set to true
outage_value["major_alert_sent"] = True
@ -1062,7 +1095,10 @@ class SlackAlerting(CustomLogger):
)
# send minor alert
await self.send_alert(
message=msg, level="Medium", alert_type="outage_alerts"
message=msg,
level="Medium",
alert_type="outage_alerts",
alerting_metadata={},
)
# set to true
outage_value["minor_alert_sent"] = True
@ -1081,7 +1117,10 @@ class SlackAlerting(CustomLogger):
)
# send minor alert
await self.send_alert(
message=msg, level="High", alert_type="outage_alerts"
message=msg,
level="High",
alert_type="outage_alerts",
alerting_metadata={},
)
# set to true
outage_value["major_alert_sent"] = True
@ -1143,7 +1182,10 @@ Model Info:
"""
alert_val = self.send_alert(
message=message, level="Low", alert_type="new_model_added"
message=message,
level="Low",
alert_type="new_model_added",
alerting_metadata={},
)
if alert_val is not None and asyncio.iscoroutine(alert_val):
@ -1159,6 +1201,9 @@ Model Info:
Currently only implemented for budget alerts
Returns -> True if sent, False if not.
Raises Exception
- if WEBHOOK_URL is not set
"""
webhook_url = os.getenv("WEBHOOK_URL", None)
@ -1297,7 +1342,9 @@ Model Info:
verbose_proxy_logger.error("Error sending email alert %s", str(e))
return False
async def send_email_alert_using_smtp(self, webhook_event: WebhookEvent) -> bool:
async def send_email_alert_using_smtp(
self, webhook_event: WebhookEvent, alert_type: str
) -> bool:
"""
Sends structured Email alert to an SMTP server
@ -1306,7 +1353,6 @@ Model Info:
Returns -> True if sent, False if not.
"""
from litellm.proxy.utils import send_email
from litellm.proxy.proxy_server import premium_user, prisma_client
email_logo_url = os.getenv(
@ -1360,6 +1406,10 @@ Model Info:
subject=email_event["subject"],
html=email_event["html"],
)
if webhook_event.event_group == "team":
from litellm.integrations.email_alerting import send_team_budget_alert
await send_team_budget_alert(webhook_event=webhook_event)
return False
@ -1368,6 +1418,7 @@ Model Info:
message: str,
level: Literal["Low", "Medium", "High"],
alert_type: Literal[AlertType],
alerting_metadata: dict,
user_info: Optional[WebhookEvent] = None,
**kwargs,
):
@ -1401,7 +1452,9 @@ Model Info:
and user_info is not None
):
# only send budget alerts over Email
await self.send_email_alert_using_smtp(webhook_event=user_info)
await self.send_email_alert_using_smtp(
webhook_event=user_info, alert_type=alert_type
)
if "slack" not in self.alerting:
return
@ -1425,6 +1478,9 @@ Model Info:
if kwargs:
for key, value in kwargs.items():
formatted_message += f"\n\n{key}: `{value}`\n\n"
if alerting_metadata:
for key, value in alerting_metadata.items():
formatted_message += f"\n\n*Alerting Metadata*: \n{key}: `{value}`\n\n"
if _proxy_base_url is not None:
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
@ -1440,7 +1496,7 @@ Model Info:
slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None)
if slack_webhook_url is None:
raise Exception("Missing SLACK_WEBHOOK_URL from environment")
raise ValueError("Missing SLACK_WEBHOOK_URL from environment")
payload = {"text": formatted_message}
headers = {"Content-type": "application/json"}
@ -1453,7 +1509,7 @@ Model Info:
pass
else:
verbose_proxy_logger.debug(
"Error sending slack alert. Error=", response.text
"Error sending slack alert. Error={}".format(response.text)
)
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
@ -1622,6 +1678,7 @@ Model Info:
message=_weekly_spend_message,
level="Low",
alert_type="spend_reports",
alerting_metadata={},
)
except Exception as e:
verbose_proxy_logger.error("Error sending weekly spend report", e)
@ -1673,6 +1730,7 @@ Model Info:
message=_spend_message,
level="Low",
alert_type="spend_reports",
alerting_metadata={},
)
except Exception as e:
verbose_proxy_logger.error("Error sending weekly spend report", e)
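
A minimal sketch of the updated send_alert signature, assuming slack_alerting is an already-configured SlackAlerting instance and SLACK_WEBHOOK_URL is set; the metadata keys and message text are illustrative. The extra key/value pairs are appended to the Slack message under "*Alerting Metadata*".

async def notify_slow_response(slack_alerting):
    # alerting_metadata is the new required argument on send_alert
    await slack_alerting.send_alert(
        message="`Responses are slow - 12.3s response time > Alerting threshold: 5s`",
        level="Low",
        alert_type="llm_too_slow",
        alerting_metadata={"environment": "staging", "team": "ml-platform"},
    )

# run with: asyncio.run(notify_slow_response(slack_alerting))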

View file

@ -0,0 +1,41 @@
# What is this?
## Helper utilities for the model response objects
def map_finish_reason(
finish_reason: str,
): # openai supports 5 finish reasons - 'stop', 'length', 'function_call', 'content_filter', 'null'
# anthropic mapping
if finish_reason == "stop_sequence":
return "stop"
# cohere mapping - https://docs.cohere.com/reference/generate
elif finish_reason == "COMPLETE":
return "stop"
elif finish_reason == "MAX_TOKENS": # cohere + vertex ai
return "length"
elif finish_reason == "ERROR_TOXIC":
return "content_filter"
elif (
finish_reason == "ERROR"
): # openai currently doesn't support an 'error' finish reason
return "stop"
# huggingface mapping https://huggingface.github.io/text-generation-inference/#/Text%20Generation%20Inference/generate_stream
elif finish_reason == "eos_token" or finish_reason == "stop_sequence":
return "stop"
elif (
finish_reason == "FINISH_REASON_UNSPECIFIED" or finish_reason == "STOP"
): # vertex ai - got from running `print(dir(response_obj.candidates[0].finish_reason))`: ['FINISH_REASON_UNSPECIFIED', 'MAX_TOKENS', 'OTHER', 'RECITATION', 'SAFETY', 'STOP',]
return "stop"
elif finish_reason == "SAFETY": # vertex ai
return "content_filter"
elif finish_reason == "STOP": # vertex ai
return "stop"
elif finish_reason == "end_turn" or finish_reason == "stop_sequence": # anthropic
return "stop"
elif finish_reason == "max_tokens": # anthropic
return "length"
elif finish_reason == "tool_use": # anthropic
return "tool_calls"
elif finish_reason == "content_filtered":
return "content_filter"
return finish_reason
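
As a quick sanity check, a few illustrative mappings produced by the helper above (the import path matches the one used elsewhere in this commit):

from litellm.litellm_core_utils.core_helpers import map_finish_reason

assert map_finish_reason("COMPLETE") == "stop"            # cohere
assert map_finish_reason("MAX_TOKENS") == "length"        # cohere / vertex ai
assert map_finish_reason("tool_use") == "tool_calls"      # anthropic
assert map_finish_reason("SAFETY") == "content_filter"    # vertex ai
assert map_finish_reason("unknown_reason") == "unknown_reason"  # unmapped values pass through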

File diff suppressed because it is too large

View file

@ -0,0 +1,210 @@
# What is this?
## Cost calculation for Google AI Studio / Vertex AI models
import traceback
from typing import List, Literal, Optional, Tuple
import litellm
from litellm import verbose_logger
"""
Gemini pricing covers:
- token
- image
- audio
- video
"""
"""
Vertex AI -> character based pricing
Google AI Studio -> token based pricing
"""
models_without_dynamic_pricing = ["gemini-1.0-pro", "gemini-pro"]
def _is_above_128k(tokens: float) -> bool:
if tokens > 128000:
return True
return False
def cost_per_character(
model: str,
custom_llm_provider: str,
prompt_tokens: float,
completion_tokens: float,
prompt_characters: float,
completion_characters: float,
) -> Tuple[float, float]:
"""
Calculates the prompt and completion cost (in USD) for a given Vertex AI model from the character and token counts.
Input:
- model: str, the model name without provider prefix
- custom_llm_provider: str, "vertex_ai-*"
- prompt_characters: float, the number of input characters
- completion_characters: float, the number of output characters
Returns:
Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
Raises:
Exception if model requires >128k pricing, but model cost not mapped
"""
## GET MODEL INFO
model_info = litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
## CALCULATE INPUT COST
try:
if (
_is_above_128k(tokens=prompt_characters * 4) # 1 token = 4 char
and model not in models_without_dynamic_pricing
):
## check if character pricing, else default to token pricing
assert (
"input_cost_per_character_above_128k_tokens" in model_info
and model_info["input_cost_per_character_above_128k_tokens"] is not None
), "model info for model={} does not have 'input_cost_per_character_above_128k_tokens'-pricing for > 128k tokens\nmodel_info={}".format(
model, model_info
)
prompt_cost = (
prompt_characters
* model_info["input_cost_per_character_above_128k_tokens"]
)
else:
assert (
"input_cost_per_character" in model_info
and model_info["input_cost_per_character"] is not None
), "model info for model={} does not have 'input_cost_per_character'-pricing\nmodel_info={}".format(
model, model_info
)
prompt_cost = prompt_characters * model_info["input_cost_per_character"]
except Exception as e:
verbose_logger.error(
"litellm.litellm_core_utils.llm_cost_calc.google.cost_per_character(): Exception occured - {}\n{}\n\
Defaulting to (cost_per_token * 4) calculation for prompt_cost".format(
str(e), traceback.format_exc()
)
)
initial_prompt_cost, _ = cost_per_token(
model=model,
custom_llm_provider=custom_llm_provider,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
prompt_cost = initial_prompt_cost * 4
## CALCULATE OUTPUT COST
try:
if (
_is_above_128k(tokens=completion_characters * 4) # 1 token = 4 char
and model not in models_without_dynamic_pricing
):
assert (
"output_cost_per_character_above_128k_tokens" in model_info
and model_info["output_cost_per_character_above_128k_tokens"]
is not None
), "model info for model={} does not have 'output_cost_per_character_above_128k_tokens' pricing\nmodel_info={}".format(
model, model_info
)
completion_cost = (
completion_tokens
* model_info["output_cost_per_character_above_128k_tokens"]
)
else:
assert (
"output_cost_per_character" in model_info
and model_info["output_cost_per_character"] is not None
), "model info for model={} does not have 'output_cost_per_character'-pricing\nmodel_info={}".format(
model, model_info
)
completion_cost = (
completion_tokens * model_info["output_cost_per_character"]
)
except Exception as e:
verbose_logger.error(
"litellm.litellm_core_utils.llm_cost_calc.google.cost_per_character(): Exception occured - {}\n{}\n\
Defaulting to (cost_per_token * 4) calculation for completion_cost".format(
str(e), traceback.format_exc()
)
)
_, initial_completion_cost = cost_per_token(
model=model,
custom_llm_provider=custom_llm_provider,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
completion_cost = initial_completion_cost * 4
return prompt_cost, completion_cost
def cost_per_token(
model: str,
custom_llm_provider: str,
prompt_tokens: float,
completion_tokens: float,
) -> Tuple[float, float]:
"""
Calculates the cost per token for a given model, prompt tokens, and completion tokens.
Input:
- model: str, the model name without provider prefix
- custom_llm_provider: str, either "vertex_ai-*" or "gemini"
- prompt_tokens: float, the number of input tokens
- completion_tokens: float, the number of output tokens
Returns:
Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
Raises:
Exception if model requires >128k pricing, but model cost not mapped
"""
## GET MODEL INFO
model_info = litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
## CALCULATE INPUT COST
if (
_is_above_128k(tokens=prompt_tokens)
and model not in models_without_dynamic_pricing
):
assert (
"input_cost_per_token_above_128k_tokens" in model_info
and model_info["input_cost_per_token_above_128k_tokens"] is not None
), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format(
model, model_info
)
prompt_cost = (
prompt_tokens * model_info["input_cost_per_token_above_128k_tokens"]
)
else:
prompt_cost = prompt_tokens * model_info["input_cost_per_token"]
## CALCULATE OUTPUT COST
if (
_is_above_128k(tokens=completion_tokens)
and model not in models_without_dynamic_pricing
):
assert (
"output_cost_per_token_above_128k_tokens" in model_info
and model_info["output_cost_per_token_above_128k_tokens"] is not None
), "model info for model={} does not have pricing for > 128k tokens\nmodel_info={}".format(
model, model_info
)
completion_cost = (
completion_tokens * model_info["output_cost_per_token_above_128k_tokens"]
)
else:
completion_cost = completion_tokens * model_info["output_cost_per_token"]
return prompt_cost, completion_cost
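
To make the >128k branch concrete, a small worked example with hypothetical per-token rates (illustrative numbers only, not actual Gemini pricing):

# hypothetical rates: $0.50 / 1M tokens up to 128k, $1.00 / 1M tokens above it
input_cost_per_token = 0.50 / 1_000_000
input_cost_per_token_above_128k = 1.00 / 1_000_000

prompt_tokens = 200_000  # above the 128k threshold
if prompt_tokens > 128_000:
    prompt_cost = prompt_tokens * input_cost_per_token_above_128k
else:
    prompt_cost = prompt_tokens * input_cost_per_token

print(f"prompt cost: ${prompt_cost:.4f}")  # 200,000 * $1.00/1M = $0.2000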

View file

@ -0,0 +1,28 @@
from typing import Dict, Optional
def _ensure_extra_body_is_safe(extra_body: Optional[Dict]) -> Optional[Dict]:
"""
Ensure that the extra_body sent in the request is safe; otherwise users will see this error:
"Object of type TextPromptClient is not JSON serializable"
Relevant Issue: https://github.com/BerriAI/litellm/issues/4140
"""
if extra_body is None:
return None
if not isinstance(extra_body, dict):
return extra_body
if "metadata" in extra_body and isinstance(extra_body["metadata"], dict):
if "prompt" in extra_body["metadata"]:
_prompt = extra_body["metadata"].get("prompt")
# users can send Langfuse TextPromptClient objects, so we need to convert them to dicts
# Langfuse TextPromptClients have .__dict__ attribute
if _prompt is not None and hasattr(_prompt, "__dict__"):
extra_body["metadata"]["prompt"] = _prompt.__dict__
return extra_body
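
A minimal sketch of the behaviour, assuming the helper above is in scope; FakeTextPromptClient is a made-up stand-in for a Langfuse TextPromptClient (any object with a __dict__ works):

import json


class FakeTextPromptClient:  # hypothetical stand-in for Langfuse's TextPromptClient
    def __init__(self):
        self.name = "my-prompt"
        self.version = 3


extra_body = {"metadata": {"prompt": FakeTextPromptClient()}}
safe_body = _ensure_extra_body_is_safe(extra_body=extra_body)

# the prompt object is now a plain dict, so the request body is JSON serializable
print(json.dumps(safe_body))  # {"metadata": {"prompt": {"name": "my-prompt", "version": 3}}}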

View file

@ -0,0 +1,71 @@
# +-----------------------------------------------+
# | |
# | Give Feedback / Get Help |
# | https://github.com/BerriAI/litellm/issues/new |
# | |
# +-----------------------------------------------+
#
# Thank you users! We ❤️ you! - Krrish & Ishaan
import copy
from typing import TYPE_CHECKING, Any
import litellm
if TYPE_CHECKING:
from litellm.litellm_core_utils.litellm_logging import (
Logging as _LiteLLMLoggingObject,
)
LiteLLMLoggingObject = _LiteLLMLoggingObject
else:
LiteLLMLoggingObject = Any
def redact_message_input_output_from_logging(
litellm_logging_obj: LiteLLMLoggingObject, result
):
"""
Removes messages, prompts, input, and response from logging. This modifies the data in-place.
Only redacts when litellm.turn_off_message_logging == True.
"""
# check if user opted out of logging message/response to callbacks
if litellm.turn_off_message_logging is not True:
return result
# remove messages, prompts, input, response from logging
litellm_logging_obj.model_call_details["messages"] = [
{"role": "user", "content": "redacted-by-litellm"}
]
litellm_logging_obj.model_call_details["prompt"] = ""
litellm_logging_obj.model_call_details["input"] = ""
# response cleaning
# ChatCompletion Responses
if (
litellm_logging_obj.stream is True
and "complete_streaming_response" in litellm_logging_obj.model_call_details
):
_streaming_response = litellm_logging_obj.model_call_details[
"complete_streaming_response"
]
for choice in _streaming_response.choices:
if isinstance(choice, litellm.Choices):
choice.message.content = "redacted-by-litellm"
elif isinstance(choice, litellm.utils.StreamingChoices):
choice.delta.content = "redacted-by-litellm"
else:
if result is not None:
if isinstance(result, litellm.ModelResponse):
# only deep copy litellm.ModelResponse
_result = copy.deepcopy(result)
if hasattr(_result, "choices") and _result.choices is not None:
for choice in _result.choices:
if isinstance(choice, litellm.Choices):
choice.message.content = "redacted-by-litellm"
elif isinstance(choice, litellm.utils.StreamingChoices):
choice.delta.content = "redacted-by-litellm"
return _result
# by default return result
return result
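
The helper only acts when message logging is switched off; a minimal sketch of the flag it checks:

import litellm

# opt out of logging message/response content to callbacks; with this set,
# the helper above swaps real content for the "redacted-by-litellm" placeholder
litellm.turn_off_message_logging = True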

View file

@ -5,10 +5,16 @@ import requests, copy # type: ignore
import time
from functools import partial
from typing import Callable, Optional, List, Union
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
import litellm.litellm_core_utils
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
from litellm.litellm_core_utils.core_helpers import map_finish_reason
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
_get_async_httpx_client,
_get_httpx_client,
)
from .base import BaseLLM
import httpx # type: ignore
from litellm.types.llms.anthropic import AnthropicMessagesToolChoice
@ -171,7 +177,7 @@ async def make_call(
logging_obj,
):
if client is None:
client = AsyncHTTPHandler() # Create a new client if none provided
client = _get_async_httpx_client() # Create a new client if none provided
response = await client.post(api_base, headers=headers, data=data, stream=True)
@ -201,7 +207,7 @@ class AnthropicChatCompletion(BaseLLM):
response: Union[requests.Response, httpx.Response],
model_response: ModelResponse,
stream: bool,
logging_obj: litellm.utils.Logging,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],
@ -316,7 +322,7 @@ class AnthropicChatCompletion(BaseLLM):
response: Union[requests.Response, httpx.Response],
model_response: ModelResponse,
stream: bool,
logging_obj: litellm.utils.Logging,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],
@ -463,9 +469,7 @@ class AnthropicChatCompletion(BaseLLM):
logger_fn=None,
headers={},
) -> Union[ModelResponse, CustomStreamWrapper]:
async_handler = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
async_handler = _get_async_httpx_client()
response = await async_handler.post(api_base, headers=headers, json=data)
if stream and _is_function_call:
return self.process_streaming_response(

View file

@ -1,42 +1,56 @@
from typing import Optional, Union, Any, Literal, Coroutine, Iterable
from typing_extensions import overload
import types, requests
from .base import BaseLLM
from litellm.utils import (
ModelResponse,
Choices,
Message,
CustomStreamWrapper,
convert_to_model_response_object,
TranscriptionResponse,
get_secret,
UnsupportedParamsError,
)
from typing import Callable, Optional, BinaryIO, List
from litellm import OpenAIConfig
import litellm, json
import httpx # type: ignore
from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport
from openai import AzureOpenAI, AsyncAzureOpenAI
import uuid
import asyncio
import json
import os
import types
import uuid
from typing import (
Any,
BinaryIO,
Callable,
Coroutine,
Iterable,
List,
Literal,
Optional,
Union,
)
import httpx # type: ignore
import requests
from openai import AsyncAzureOpenAI, AzureOpenAI
from typing_extensions import overload
import litellm
from litellm import OpenAIConfig
from litellm.caching import DualCache
from litellm.utils import (
Choices,
CustomStreamWrapper,
Message,
ModelResponse,
TranscriptionResponse,
UnsupportedParamsError,
convert_to_model_response_object,
get_secret,
)
from ..types.llms.openai import (
AsyncCursorPage,
AssistantToolParam,
SyncCursorPage,
Assistant,
MessageData,
OpenAIMessage,
OpenAICreateThreadParamsMessage,
Thread,
AssistantToolParam,
Run,
AssistantEventHandler,
AssistantStreamManager,
AssistantToolParam,
AsyncAssistantEventHandler,
AsyncAssistantStreamManager,
AssistantStreamManager,
AsyncCursorPage,
MessageData,
OpenAICreateThreadParamsMessage,
OpenAIMessage,
Run,
SyncCursorPage,
Thread,
)
from litellm.caching import DualCache
from .base import BaseLLM
from .custom_httpx.azure_dall_e_2 import AsyncCustomHTTPTransport, CustomHTTPTransport
azure_ad_cache = DualCache()
@ -313,7 +327,9 @@ def select_azure_base_url_or_endpoint(azure_client_params: dict):
def get_azure_ad_token_from_oidc(azure_ad_token: str):
azure_client_id = os.getenv("AZURE_CLIENT_ID", None)
azure_tenant_id = os.getenv("AZURE_TENANT_ID", None)
azure_authority_host = os.getenv("AZURE_AUTHORITY_HOST", "https://login.microsoftonline.com")
azure_authority_host = os.getenv(
"AZURE_AUTHORITY_HOST", "https://login.microsoftonline.com"
)
if azure_client_id is None or azure_tenant_id is None:
raise AzureOpenAIError(
@ -329,12 +345,14 @@ def get_azure_ad_token_from_oidc(azure_ad_token: str):
message="OIDC token could not be retrieved from secret manager.",
)
azure_ad_token_cache_key = json.dumps({
"azure_client_id": azure_client_id,
"azure_tenant_id": azure_tenant_id,
"azure_authority_host": azure_authority_host,
"oidc_token": oidc_token,
})
azure_ad_token_cache_key = json.dumps(
{
"azure_client_id": azure_client_id,
"azure_tenant_id": azure_tenant_id,
"azure_authority_host": azure_authority_host,
"oidc_token": oidc_token,
}
)
azure_ad_token_access_token = azure_ad_cache.get_cache(azure_ad_token_cache_key)
if azure_ad_token_access_token is not None:
@ -371,7 +389,11 @@ def get_azure_ad_token_from_oidc(azure_ad_token: str):
status_code=422, message="Azure AD Token expires_in not returned"
)
azure_ad_cache.set_cache(key=azure_ad_token_cache_key, value=azure_ad_token_access_token, ttl=azure_ad_token_expires_in)
azure_ad_cache.set_cache(
key=azure_ad_token_cache_key,
value=azure_ad_token_access_token,
ttl=azure_ad_token_expires_in,
)
return azure_ad_token_access_token
@ -645,6 +667,8 @@ class AzureChatCompletion(BaseLLM):
except AzureOpenAIError as e:
exception_mapping_worked = True
raise e
except asyncio.CancelledError as e:
raise AzureOpenAIError(status_code=500, message=str(e))
except Exception as e:
if hasattr(e, "status_code"):
raise e

View file

@ -2,7 +2,7 @@
import litellm
import httpx, requests
from typing import Optional, Union
from litellm.utils import Logging
from litellm.litellm_core_utils.litellm_logging import Logging
class BaseLLM:
@ -27,6 +27,25 @@ class BaseLLM:
"""
return model_response
def process_text_completion_response(
self,
model: str,
response: Union[requests.Response, httpx.Response],
model_response: litellm.utils.TextCompletionResponse,
stream: bool,
logging_obj: Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],
messages: list,
print_verbose,
encoding,
) -> Union[litellm.utils.TextCompletionResponse, litellm.utils.CustomStreamWrapper]:
"""
Helper function to process the response across sync + async completion calls
"""
return model_response
def create_client_session(self):
if litellm.client_session:
_client_session = litellm.client_session

View file

@ -1,25 +1,27 @@
import json, copy, types
import copy
import json
import os
import time
import types
import uuid
from enum import Enum
import time, uuid
from typing import Callable, Optional, Any, Union, List
from typing import Any, Callable, List, Optional, Union
import httpx
import litellm
from litellm.utils import (
ModelResponse,
get_secret,
Usage,
ImageResponse,
map_finish_reason,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.types.utils import ImageResponse, ModelResponse, Usage
from litellm.utils import get_secret
from .prompt_templates.factory import (
prompt_factory,
custom_prompt,
construct_tool_use_system_prompt,
contains_tag,
custom_prompt,
extract_between_tags,
parse_xml_params,
contains_tag,
prompt_factory,
)
import httpx
class BedrockError(Exception):
@ -633,7 +635,11 @@ def init_bedrock_client(
config = boto3.session.Config()
### CHECK STS ###
if aws_web_identity_token is not None and aws_role_name is not None and aws_session_name is not None:
if (
aws_web_identity_token is not None
and aws_role_name is not None
and aws_session_name is not None
):
oidc_token = get_secret(aws_web_identity_token)
if oidc_token is None:
@ -642,9 +648,7 @@ def init_bedrock_client(
status_code=401,
)
sts_client = boto3.client(
"sts"
)
sts_client = boto3.client("sts")
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sts/client/assume_role_with_web_identity.html
@ -726,38 +730,31 @@ def init_bedrock_client(
def convert_messages_to_prompt(model, messages, provider, custom_prompt_dict):
# handle anthropic prompts and amazon titan prompts
if provider == "anthropic" or provider == "amazon":
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
prompt = custom_prompt(
role_dict=model_prompt_details["roles"],
initial_prompt_value=model_prompt_details["initial_prompt_value"],
final_prompt_value=model_prompt_details["final_prompt_value"],
messages=messages,
)
else:
chat_template_provider = ["anthropic", "amazon", "mistral", "meta"]
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
prompt = custom_prompt(
role_dict=model_prompt_details["roles"],
initial_prompt_value=model_prompt_details["initial_prompt_value"],
final_prompt_value=model_prompt_details["final_prompt_value"],
messages=messages,
)
else:
if provider in chat_template_provider:
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="bedrock"
)
elif provider == "mistral":
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="bedrock"
)
elif provider == "meta":
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="bedrock"
)
else:
prompt = ""
for message in messages:
if "role" in message:
if message["role"] == "user":
prompt += f"{message['content']}"
else:
prompt = ""
for message in messages:
if "role" in message:
if message["role"] == "user":
prompt += f"{message['content']}"
else:
prompt += f"{message['content']}"
else:
prompt += f"{message['content']}"
else:
prompt += f"{message['content']}"
return prompt

View file

@ -22,13 +22,12 @@ from typing import (
from litellm.utils import (
ModelResponse,
Usage,
map_finish_reason,
CustomStreamWrapper,
Message,
Choices,
get_secret,
Logging,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.litellm_core_utils.litellm_logging import Logging
from litellm.types.utils import Message, Choices
import litellm, uuid
from .prompt_templates.factory import (
prompt_factory,
@ -41,7 +40,12 @@ from .prompt_templates.factory import (
_bedrock_converse_messages_pt,
_bedrock_tools_pt,
)
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
HTTPHandler,
_get_async_httpx_client,
_get_httpx_client,
)
from .base import BaseLLM
import httpx # type: ignore
from .bedrock import BedrockError, convert_messages_to_prompt, ModelResponseIterator
@ -57,6 +61,7 @@ from litellm.caching import DualCache
iam_cache = DualCache()
class AmazonCohereChatConfig:
"""
Reference - https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-cohere-command-r-plus.html
@ -167,7 +172,7 @@ async def make_call(
logging_obj,
):
if client is None:
client = AsyncHTTPHandler() # Create a new client if none provided
client = _get_async_httpx_client() # Create a new client if none provided
response = await client.post(api_base, headers=headers, data=data, stream=True)
@ -198,7 +203,7 @@ def make_sync_call(
logging_obj,
):
if client is None:
client = HTTPHandler() # Create a new client if none provided
client = _get_httpx_client() # Create a new client if none provided
response = client.post(api_base, headers=headers, data=data, stream=True)
@ -327,13 +332,19 @@ class BedrockLLM(BaseLLM):
) = params_to_check
### CHECK STS ###
if aws_web_identity_token is not None and aws_role_name is not None and aws_session_name is not None:
iam_creds_cache_key = json.dumps({
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
})
if (
aws_web_identity_token is not None
and aws_role_name is not None
and aws_session_name is not None
):
iam_creds_cache_key = json.dumps(
{
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
}
)
iam_creds_dict = iam_cache.get_cache(iam_creds_cache_key)
if iam_creds_dict is None:
@ -348,7 +359,7 @@ class BedrockLLM(BaseLLM):
sts_client = boto3.client(
"sts",
region_name=aws_region_name,
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com"
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com",
)
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
@ -362,12 +373,18 @@ class BedrockLLM(BaseLLM):
iam_creds_dict = {
"aws_access_key_id": sts_response["Credentials"]["AccessKeyId"],
"aws_secret_access_key": sts_response["Credentials"]["SecretAccessKey"],
"aws_secret_access_key": sts_response["Credentials"][
"SecretAccessKey"
],
"aws_session_token": sts_response["Credentials"]["SessionToken"],
"region_name": aws_region_name,
}
iam_cache.set_cache(key=iam_creds_cache_key, value=json.dumps(iam_creds_dict), ttl=3600 - 60)
iam_cache.set_cache(
key=iam_creds_cache_key,
value=json.dumps(iam_creds_dict),
ttl=3600 - 60,
)
session = boto3.Session(**iam_creds_dict)
@ -976,7 +993,7 @@ class BedrockLLM(BaseLLM):
if isinstance(timeout, float) or isinstance(timeout, int):
timeout = httpx.Timeout(timeout)
_params["timeout"] = timeout
self.client = HTTPHandler(**_params) # type: ignore
self.client = _get_httpx_client(_params) # type: ignore
else:
self.client = client
if (stream is not None and stream == True) and provider != "ai21":
@ -1058,7 +1075,7 @@ class BedrockLLM(BaseLLM):
if isinstance(timeout, float) or isinstance(timeout, int):
timeout = httpx.Timeout(timeout)
_params["timeout"] = timeout
client = AsyncHTTPHandler(**_params) # type: ignore
client = _get_async_httpx_client(_params) # type: ignore
else:
client = client # type: ignore
@ -1433,13 +1450,19 @@ class BedrockConverseLLM(BaseLLM):
) = params_to_check
### CHECK STS ###
if aws_web_identity_token is not None and aws_role_name is not None and aws_session_name is not None:
iam_creds_cache_key = json.dumps({
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
})
if (
aws_web_identity_token is not None
and aws_role_name is not None
and aws_session_name is not None
):
iam_creds_cache_key = json.dumps(
{
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
}
)
iam_creds_dict = iam_cache.get_cache(iam_creds_cache_key)
if iam_creds_dict is None:
@ -1454,7 +1477,7 @@ class BedrockConverseLLM(BaseLLM):
sts_client = boto3.client(
"sts",
region_name=aws_region_name,
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com"
endpoint_url=f"https://sts.{aws_region_name}.amazonaws.com",
)
# https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html
@ -1468,12 +1491,18 @@ class BedrockConverseLLM(BaseLLM):
iam_creds_dict = {
"aws_access_key_id": sts_response["Credentials"]["AccessKeyId"],
"aws_secret_access_key": sts_response["Credentials"]["SecretAccessKey"],
"aws_secret_access_key": sts_response["Credentials"][
"SecretAccessKey"
],
"aws_session_token": sts_response["Credentials"]["SessionToken"],
"region_name": aws_region_name,
}
iam_cache.set_cache(key=iam_creds_cache_key, value=json.dumps(iam_creds_dict), ttl=3600 - 60)
iam_cache.set_cache(
key=iam_creds_cache_key,
value=json.dumps(iam_creds_dict),
ttl=3600 - 60,
)
session = boto3.Session(**iam_creds_dict)
@ -1575,7 +1604,7 @@ class BedrockConverseLLM(BaseLLM):
if isinstance(timeout, float) or isinstance(timeout, int):
timeout = httpx.Timeout(timeout)
_params["timeout"] = timeout
client = AsyncHTTPHandler(**_params) # type: ignore
client = _get_async_httpx_client(_params) # type: ignore
else:
client = client # type: ignore
@ -1847,7 +1876,7 @@ class BedrockConverseLLM(BaseLLM):
if isinstance(timeout, float) or isinstance(timeout, int):
timeout = httpx.Timeout(timeout)
_params["timeout"] = timeout
client = HTTPHandler(**_params) # type: ignore
client = _get_httpx_client(_params) # type: ignore
else:
client = client
try:

View file

@ -12,6 +12,15 @@ class AsyncHTTPHandler:
timeout: Optional[Union[float, httpx.Timeout]] = None,
concurrent_limit=1000,
):
self.timeout = timeout
self.client = self.create_client(
timeout=timeout, concurrent_limit=concurrent_limit
)
def create_client(
self, timeout: Optional[Union[float, httpx.Timeout]], concurrent_limit: int
) -> httpx.AsyncClient:
async_proxy_mounts = None
# Check if the HTTP_PROXY and HTTPS_PROXY environment variables are set and use them accordingly.
http_proxy = os.getenv("HTTP_PROXY", None)
@ -39,7 +48,8 @@ class AsyncHTTPHandler:
if timeout is None:
timeout = _DEFAULT_TIMEOUT
# Create a client with a connection pool
self.client = httpx.AsyncClient(
return httpx.AsyncClient(
timeout=timeout,
limits=httpx.Limits(
max_connections=concurrent_limit,
@ -83,11 +93,48 @@ class AsyncHTTPHandler:
response = await self.client.send(req, stream=stream)
response.raise_for_status()
return response
except httpx.RemoteProtocolError:
# Retry the request with a new session if there is a connection error
new_client = self.create_client(timeout=self.timeout, concurrent_limit=1)
try:
return await self.single_connection_post_request(
url=url,
client=new_client,
data=data,
json=json,
params=params,
headers=headers,
stream=stream,
)
finally:
await new_client.aclose()
except httpx.HTTPStatusError as e:
raise e
except Exception as e:
raise e
async def single_connection_post_request(
self,
url: str,
client: httpx.AsyncClient,
data: Optional[Union[dict, str]] = None, # type: ignore
json: Optional[dict] = None,
params: Optional[dict] = None,
headers: Optional[dict] = None,
stream: bool = False,
):
"""
Makes a POST request using a single-connection client.
Used to retry requests after connection-level errors.
"""
req = client.build_request(
"POST", url, data=data, json=json, params=params, headers=headers # type: ignore
)
response = await client.send(req, stream=stream)
response.raise_for_status()
return response
def __del__(self) -> None:
try:
asyncio.get_running_loop().create_task(self.close())
@ -172,3 +219,60 @@ class HTTPHandler:
self.close()
except Exception:
pass
def _get_async_httpx_client(params: Optional[dict] = None) -> AsyncHTTPHandler:
"""
Retrieves the async HTTP client from the cache
If not present, creates a new client
Caches the new client and returns it.
"""
_params_key_name = ""
if params is not None:
for key, value in params.items():
try:
_params_key_name += f"{key}_{value}"
except Exception:
pass
_cache_key_name = "async_httpx_client" + _params_key_name
if _cache_key_name in litellm.in_memory_llm_clients_cache:
return litellm.in_memory_llm_clients_cache[_cache_key_name]
if params is not None:
_new_client = AsyncHTTPHandler(**params)
else:
_new_client = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
litellm.in_memory_llm_clients_cache[_cache_key_name] = _new_client
return _new_client
def _get_httpx_client(params: Optional[dict] = None) -> HTTPHandler:
"""
Retrieves the HTTP client from the cache
If not present, creates a new client
Caches the new client and returns it.
"""
_params_key_name = ""
if params is not None:
for key, value in params.items():
try:
_params_key_name += f"{key}_{value}"
except Exception:
pass
_cache_key_name = "httpx_client" + _params_key_name
if _cache_key_name in litellm.in_memory_llm_clients_cache:
return litellm.in_memory_llm_clients_cache[_cache_key_name]
if params is not None:
_new_client = HTTPHandler(**params)
else:
_new_client = HTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0))
litellm.in_memory_llm_clients_cache[_cache_key_name] = _new_client
return _new_client
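
A minimal sketch of how the cached-client helpers behave (the import path matches the call sites in this commit); the 30s timeout is just an example value:

import httpx

from litellm.llms.custom_httpx.http_handler import _get_httpx_client

# same params -> same cached client instance (the cache key is derived from the params)
client_a = _get_httpx_client()
client_b = _get_httpx_client()
assert client_a is client_b

# a different timeout produces a different cache key, hence a new client
client_c = _get_httpx_client({"timeout": httpx.Timeout(timeout=30.0, connect=5.0)})
assert client_c is not client_a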

View file

@ -10,10 +10,10 @@ from typing import Callable, Optional, List, Union, Tuple, Literal
from litellm.utils import (
ModelResponse,
Usage,
map_finish_reason,
CustomStreamWrapper,
EmbeddingResponse,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
@ -289,7 +289,7 @@ class DatabricksChatCompletion(BaseLLM):
response: Union[requests.Response, httpx.Response],
model_response: ModelResponse,
stream: bool,
logging_obj: litellm.utils.Logging,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],

View file

@ -1,14 +1,22 @@
import types
import traceback
####################################
######### DEPRECATED FILE ##########
####################################
# logic moved to `vertex_httpx.py` #
import copy
import time
import traceback
import types
from typing import Callable, Optional
from litellm.utils import ModelResponse, Choices, Message, Usage
import litellm
import httpx
from .prompt_templates.factory import prompt_factory, custom_prompt, get_system_prompt
from packaging.version import Version
import litellm
from litellm import verbose_logger
from litellm.utils import Choices, Message, ModelResponse, Usage
from .prompt_templates.factory import custom_prompt, get_system_prompt, prompt_factory
class GeminiError(Exception):
@ -186,8 +194,8 @@ def completion(
if _system_instruction and len(system_prompt) > 0:
_params["system_instruction"] = system_prompt
_model = genai.GenerativeModel(**_params)
if stream == True:
if acompletion == True:
if stream is True:
if acompletion is True:
async def async_streaming():
try:

View file

@ -1,33 +1,41 @@
import hashlib
import json
import time
import traceback
import types
from typing import (
Optional,
Union,
Any,
BinaryIO,
Literal,
Callable,
Coroutine,
Iterable,
Literal,
Optional,
Union,
)
import hashlib
from typing_extensions import override, overload
from pydantic import BaseModel
import types, time, json, traceback
import httpx
from .base import BaseLLM
from litellm.utils import (
ModelResponse,
Choices,
Message,
CustomStreamWrapper,
convert_to_model_response_object,
Usage,
TranscriptionResponse,
TextCompletionResponse,
)
from typing import Callable, Optional, Coroutine
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from openai import OpenAI, AsyncOpenAI
from ..types.llms.openai import *
import openai
from openai import AsyncOpenAI, OpenAI
from pydantic import BaseModel
from typing_extensions import overload, override
import litellm
from litellm.types.utils import ProviderField
from litellm.utils import (
Choices,
CustomStreamWrapper,
Message,
ModelResponse,
TextCompletionResponse,
TranscriptionResponse,
Usage,
convert_to_model_response_object,
)
from ..types.llms.openai import *
from .base import BaseLLM
from .prompt_templates.factory import custom_prompt, prompt_factory
class OpenAIError(Exception):
@ -207,6 +215,25 @@ class MistralEmbeddingConfig:
return optional_params
class AzureAIStudioConfig:
def get_required_params(self) -> List[ProviderField]:
"""For a given provider, return it's required fields with a description"""
return [
ProviderField(
field_name="api_key",
field_type="string",
field_description="Your Azure AI Studio API Key.",
field_value="zEJ...",
),
ProviderField(
field_name="api_base",
field_type="string",
field_description="Your Azure AI Studio API Base.",
field_value="https://Mistral-serverless.",
),
]
class DeepInfraConfig:
"""
Reference: https://deepinfra.com/docs/advanced/openai_api
@ -286,8 +313,12 @@ class DeepInfraConfig:
]
def map_openai_params(
self, non_default_params: dict, optional_params: dict, model: str
):
self,
non_default_params: dict,
optional_params: dict,
model: str,
drop_params: bool,
) -> dict:
supported_openai_params = self.get_supported_openai_params()
for param, value in non_default_params.items():
if (
@ -296,8 +327,23 @@ class DeepInfraConfig:
and model == "mistralai/Mistral-7B-Instruct-v0.1"
): # this model does no support temperature == 0
value = 0.0001 # close to 0
if param == "tool_choice":
if (
value != "auto" and value != "none"
): # https://deepinfra.com/docs/advanced/function_calling
## UNSUPPORTED TOOL CHOICE VALUE
if litellm.drop_params is True or drop_params is True:
value = None
else:
raise litellm.utils.UnsupportedParamsError(
message="Deepinfra doesn't support tool_choice={}. To drop unsupported openai params from the call, set `litellm.drop_params = True`".format(
value
),
status_code=400,
)
if param in supported_openai_params:
optional_params[param] = value
if value is not None:
optional_params[param] = value
return optional_params
@ -1530,6 +1576,7 @@ class OpenAITextCompletion(BaseLLM):
response = openai_client.completions.create(**data) # type: ignore
response_json = response.model_dump()
## LOGGING
logging_obj.post_call(
input=prompt,

View file

@ -12,11 +12,11 @@ from typing import Callable, Optional, List, Literal, Union
from litellm.utils import (
ModelResponse,
Usage,
map_finish_reason,
CustomStreamWrapper,
Message,
Choices,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
@ -198,7 +198,7 @@ class PredibaseChatCompletion(BaseLLM):
response: Union[requests.Response, httpx.Response],
model_response: ModelResponse,
stream: bool,
logging_obj: litellm.utils.Logging,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],

View file

@ -1,24 +1,30 @@
import json
import re
import traceback
import uuid
import xml.etree.ElementTree as ET
from enum import Enum
import requests, traceback
import json, re, xml.etree.ElementTree as ET
from jinja2 import Template, exceptions, meta, BaseLoader
from jinja2.sandbox import ImmutableSandboxedEnvironment
from typing import Any, List, Mapping, MutableMapping, Optional, Sequence, Tuple
import requests
from jinja2 import BaseLoader, Template, exceptions, meta
from jinja2.sandbox import ImmutableSandboxedEnvironment
import litellm
import litellm.types
from litellm.types.completion import (
ChatCompletionUserMessageParam,
ChatCompletionSystemMessageParam,
ChatCompletionMessageParam,
ChatCompletionFunctionMessageParam,
ChatCompletionMessageToolCallParam,
ChatCompletionToolMessageParam,
)
import litellm.types.llms
from litellm.types.llms.anthropic import *
import uuid
from litellm.types.llms.bedrock import MessageBlock as BedrockMessageBlock
import litellm.types.llms.vertex_ai
from litellm.types.completion import (
ChatCompletionFunctionMessageParam,
ChatCompletionMessageParam,
ChatCompletionMessageToolCallParam,
ChatCompletionSystemMessageParam,
ChatCompletionToolMessageParam,
ChatCompletionUserMessageParam,
)
from litellm.types.llms.anthropic import *
from litellm.types.llms.bedrock import MessageBlock as BedrockMessageBlock
from litellm.types.utils import GenericImageParsingChunk
def default_pt(messages):
@ -622,9 +628,10 @@ def construct_tool_use_system_prompt(
def convert_url_to_base64(url):
import requests
import base64
import requests
for _ in range(3):
try:
response = requests.get(url)
@ -654,7 +661,7 @@ def convert_url_to_base64(url):
raise Exception(f"Error: Unable to fetch image from URL. url={url}")
def convert_to_anthropic_image_obj(openai_image_url: str):
def convert_to_anthropic_image_obj(openai_image_url: str) -> GenericImageParsingChunk:
"""
Input:
"image_url": "data:image/jpeg;base64,{base64_image}",
@ -675,11 +682,11 @@ def convert_to_anthropic_image_obj(openai_image_url: str):
# Infer image format from the URL
image_format = openai_image_url.split("data:image/")[1].split(";base64,")[0]
return {
"type": "base64",
"media_type": f"image/{image_format}",
"data": base64_data,
}
return GenericImageParsingChunk(
type="base64",
media_type=f"image/{image_format}",
data=base64_data,
)
except Exception as e:
if "Error: Unable to fetch image from URL" in str(e):
raise e
@ -1606,19 +1613,23 @@ def azure_text_pt(messages: list):
###### AMAZON BEDROCK #######
from litellm.types.llms.bedrock import ContentBlock as BedrockContentBlock
from litellm.types.llms.bedrock import ImageBlock as BedrockImageBlock
from litellm.types.llms.bedrock import ImageSourceBlock as BedrockImageSourceBlock
from litellm.types.llms.bedrock import ToolBlock as BedrockToolBlock
from litellm.types.llms.bedrock import (
ToolResultContentBlock as BedrockToolResultContentBlock,
ToolResultBlock as BedrockToolResultBlock,
ToolConfigBlock as BedrockToolConfigBlock,
ToolUseBlock as BedrockToolUseBlock,
ImageSourceBlock as BedrockImageSourceBlock,
ImageBlock as BedrockImageBlock,
ContentBlock as BedrockContentBlock,
ToolInputSchemaBlock as BedrockToolInputSchemaBlock,
ToolSpecBlock as BedrockToolSpecBlock,
ToolBlock as BedrockToolBlock,
ToolChoiceValuesBlock as BedrockToolChoiceValuesBlock,
)
from litellm.types.llms.bedrock import ToolConfigBlock as BedrockToolConfigBlock
from litellm.types.llms.bedrock import (
ToolInputSchemaBlock as BedrockToolInputSchemaBlock,
)
from litellm.types.llms.bedrock import ToolResultBlock as BedrockToolResultBlock
from litellm.types.llms.bedrock import (
ToolResultContentBlock as BedrockToolResultContentBlock,
)
from litellm.types.llms.bedrock import ToolSpecBlock as BedrockToolSpecBlock
from litellm.types.llms.bedrock import ToolUseBlock as BedrockToolUseBlock
def get_image_details(image_url) -> Tuple[str, str]:
@ -1655,7 +1666,8 @@ def get_image_details(image_url) -> Tuple[str, str]:
def _process_bedrock_converse_image_block(image_url: str) -> BedrockImageBlock:
if "base64" in image_url:
# Case 1: Images with base64 encoding
import base64, re
import base64
import re
# base 64 is passed as data:image/jpeg;base64,<base-64-encoded-image>
image_metadata, img_without_base_64 = image_url.split(",")

View file

@ -0,0 +1,532 @@
# What is this?
## Controller file for TextCompletionCodestral Integration - https://codestral.com/
from functools import partial
import os, types
import traceback
import json
from enum import Enum
import requests, copy # type: ignore
import time
from typing import Callable, Optional, List, Literal, Union
from litellm.utils import (
TextCompletionResponse,
Usage,
CustomStreamWrapper,
Message,
Choices,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.types.llms.databricks import GenericStreamingChunk
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from .base import BaseLLM
import httpx # type: ignore
class TextCompletionCodestralError(Exception):
def __init__(
self,
status_code,
message,
request: Optional[httpx.Request] = None,
response: Optional[httpx.Response] = None,
):
self.status_code = status_code
self.message = message
if request is not None:
self.request = request
else:
self.request = httpx.Request(
method="POST",
url="https://docs.codestral.com/user-guide/inference/rest_api",
)
if response is not None:
self.response = response
else:
self.response = httpx.Response(
status_code=status_code, request=self.request
)
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs
async def make_call(
client: AsyncHTTPHandler,
api_base: str,
headers: dict,
data: str,
model: str,
messages: list,
logging_obj,
):
response = await client.post(api_base, headers=headers, data=data, stream=True)
if response.status_code != 200:
raise TextCompletionCodestralError(
status_code=response.status_code, message=response.text
)
completion_stream = response.aiter_lines()
# LOGGING
logging_obj.post_call(
input=messages,
api_key="",
original_response=completion_stream, # Pass the completion stream for logging
additional_args={"complete_input_dict": data},
)
return completion_stream
class MistralTextCompletionConfig:
"""
Reference: https://docs.mistral.ai/api/#operation/createFIMCompletion
"""
suffix: Optional[str] = None
temperature: Optional[int] = None
top_p: Optional[float] = None
max_tokens: Optional[int] = None
min_tokens: Optional[int] = None
stream: Optional[bool] = None
random_seed: Optional[int] = None
stop: Optional[str] = None
def __init__(
self,
suffix: Optional[str] = None,
temperature: Optional[int] = None,
top_p: Optional[float] = None,
max_tokens: Optional[int] = None,
min_tokens: Optional[int] = None,
stream: Optional[bool] = None,
random_seed: Optional[int] = None,
stop: Optional[str] = None,
) -> None:
locals_ = locals().copy()
for key, value in locals_.items():
if key != "self" and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
}
def get_supported_openai_params(self):
return [
"suffix",
"temperature",
"top_p",
"max_tokens",
"stream",
"seed",
"stop",
]
def map_openai_params(self, non_default_params: dict, optional_params: dict):
for param, value in non_default_params.items():
if param == "suffix":
optional_params["suffix"] = value
if param == "temperature":
optional_params["temperature"] = value
if param == "top_p":
optional_params["top_p"] = value
if param == "max_tokens":
optional_params["max_tokens"] = value
if param == "stream" and value == True:
optional_params["stream"] = value
if param == "stop":
optional_params["stop"] = value
if param == "seed":
optional_params["random_seed"] = value
if param == "min_tokens":
optional_params["min_tokens"] = value
return optional_params
def _chunk_parser(self, chunk_data: str) -> GenericStreamingChunk:
text = ""
is_finished = False
finish_reason = None
logprobs = None
chunk_data = chunk_data.replace("data:", "")
chunk_data = chunk_data.strip()
if len(chunk_data) == 0 or chunk_data == "[DONE]":
return {
"text": "",
"is_finished": is_finished,
"finish_reason": finish_reason,
}
chunk_data_dict = json.loads(chunk_data)
original_chunk = litellm.ModelResponse(**chunk_data_dict, stream=True)
_choices = chunk_data_dict.get("choices", []) or []
_choice = _choices[0]
text = _choice.get("delta", {}).get("content", "")
if _choice.get("finish_reason") is not None:
is_finished = True
finish_reason = _choice.get("finish_reason")
logprobs = _choice.get("logprobs")
return GenericStreamingChunk(
text=text,
original_chunk=original_chunk,
is_finished=is_finished,
finish_reason=finish_reason,
logprobs=logprobs,
)
class CodestralTextCompletion(BaseLLM):
def __init__(self) -> None:
super().__init__()
def _validate_environment(
self,
api_key: Optional[str],
user_headers: dict,
) -> dict:
if api_key is None:
raise ValueError(
"Missing CODESTRAL_API_Key - Please add CODESTRAL_API_Key to your environment variables"
)
headers = {
"content-type": "application/json",
"Authorization": "Bearer {}".format(api_key),
}
if user_headers is not None and isinstance(user_headers, dict):
headers = {**headers, **user_headers}
return headers
def output_parser(self, generated_text: str):
"""
Parse the output text to remove any special characters. In our current approach we just check for ChatML tokens.
Initial issue that prompted this - https://github.com/BerriAI/litellm/issues/763
"""
chat_template_tokens = [
"<|assistant|>",
"<|system|>",
"<|user|>",
"<s>",
"</s>",
]
for token in chat_template_tokens:
if generated_text.strip().startswith(token):
generated_text = generated_text.replace(token, "", 1)
if generated_text.endswith(token):
generated_text = generated_text[::-1].replace(token[::-1], "", 1)[::-1]
return generated_text
def process_text_completion_response(
self,
model: str,
response: Union[requests.Response, httpx.Response],
model_response: TextCompletionResponse,
stream: bool,
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
optional_params: dict,
api_key: str,
data: Union[dict, str],
messages: list,
print_verbose,
encoding,
) -> TextCompletionResponse:
## LOGGING
logging_obj.post_call(
input=messages,
api_key=api_key,
original_response=response.text,
additional_args={"complete_input_dict": data},
)
print_verbose(f"codestral api: raw model_response: {response.text}")
## RESPONSE OBJECT
if response.status_code != 200:
raise TextCompletionCodestralError(
message=str(response.text),
status_code=response.status_code,
)
try:
completion_response = response.json()
except:
raise TextCompletionCodestralError(message=response.text, status_code=422)
_original_choices = completion_response.get("choices", [])
_choices: List[litellm.utils.TextChoices] = []
for choice in _original_choices:
# This is what 1 choice looks like from codestral API
# {
# "index": 0,
# "message": {
# "role": "assistant",
# "content": "\n assert is_odd(1)\n assert",
# "tool_calls": null
# },
# "finish_reason": "length",
# "logprobs": null
# }
_finish_reason = None
_index = 0
_text = None
_logprobs = None
_choice_message = choice.get("message", {})
_choice = litellm.utils.TextChoices(
finish_reason=choice.get("finish_reason"),
index=choice.get("index"),
text=_choice_message.get("content"),
logprobs=choice.get("logprobs"),
)
_choices.append(_choice)
_response = litellm.TextCompletionResponse(
id=completion_response.get("id"),
choices=_choices,
created=completion_response.get("created"),
model=completion_response.get("model"),
usage=completion_response.get("usage"),
stream=False,
object=completion_response.get("object"),
)
return _response
def completion(
self,
model: str,
messages: list,
api_base: str,
custom_prompt_dict: dict,
model_response: TextCompletionResponse,
print_verbose: Callable,
encoding,
api_key: str,
logging_obj,
optional_params: dict,
timeout: Union[float, httpx.Timeout],
acompletion=None,
litellm_params=None,
logger_fn=None,
headers: dict = {},
) -> Union[TextCompletionResponse, CustomStreamWrapper]:
headers = self._validate_environment(api_key, headers)
completion_url = api_base or "https://codestral.mistral.ai/v1/fim/completions"
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
prompt = custom_prompt(
role_dict=model_prompt_details["roles"],
initial_prompt_value=model_prompt_details["initial_prompt_value"],
final_prompt_value=model_prompt_details["final_prompt_value"],
messages=messages,
)
else:
prompt = prompt_factory(model=model, messages=messages)
## Load Config
config = litellm.MistralTextCompletionConfig.get_config()
for k, v in config.items():
if (
k not in optional_params
): # completion(top_k=3) > codestral_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
stream = optional_params.pop("stream", False)
data = {
"prompt": prompt,
**optional_params,
}
input_text = prompt
## LOGGING
logging_obj.pre_call(
input=input_text,
api_key=api_key,
additional_args={
"complete_input_dict": data,
"headers": headers,
"api_base": completion_url,
"acompletion": acompletion,
},
)
## COMPLETION CALL
if acompletion is True:
### ASYNC STREAMING
if stream is True:
return self.async_streaming(
model=model,
messages=messages,
data=data,
api_base=completion_url,
model_response=model_response,
print_verbose=print_verbose,
encoding=encoding,
api_key=api_key,
logging_obj=logging_obj,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
) # type: ignore
else:
### ASYNC COMPLETION
return self.async_completion(
model=model,
messages=messages,
data=data,
api_base=completion_url,
model_response=model_response,
print_verbose=print_verbose,
encoding=encoding,
api_key=api_key,
logging_obj=logging_obj,
optional_params=optional_params,
stream=False,
litellm_params=litellm_params,
logger_fn=logger_fn,
headers=headers,
timeout=timeout,
) # type: ignore
### SYNC STREAMING
if stream is True:
response = requests.post(
completion_url,
headers=headers,
data=json.dumps(data),
stream=stream,
)
_response = CustomStreamWrapper(
response.iter_lines(),
model,
custom_llm_provider="codestral",
logging_obj=logging_obj,
)
return _response
### SYNC COMPLETION
else:
response = requests.post(
url=completion_url,
headers=headers,
data=json.dumps(data),
)
return self.process_text_completion_response(
model=model,
response=response,
model_response=model_response,
stream=optional_params.get("stream", False),
logging_obj=logging_obj, # type: ignore
optional_params=optional_params,
api_key=api_key,
data=data,
messages=messages,
print_verbose=print_verbose,
encoding=encoding,
)
async def async_completion(
self,
model: str,
messages: list,
api_base: str,
model_response: TextCompletionResponse,
print_verbose: Callable,
encoding,
api_key,
logging_obj,
stream,
data: dict,
optional_params: dict,
timeout: Union[float, httpx.Timeout],
litellm_params=None,
logger_fn=None,
headers={},
) -> TextCompletionResponse:
async_handler = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=timeout))
try:
response = await async_handler.post(
api_base, headers=headers, data=json.dumps(data)
)
except httpx.HTTPStatusError as e:
raise TextCompletionCodestralError(
status_code=e.response.status_code,
message="HTTPStatusError - {}".format(e.response.text),
)
except Exception as e:
raise TextCompletionCodestralError(
status_code=500, message="{}\n{}".format(str(e), traceback.format_exc())
)
return self.process_text_completion_response(
model=model,
response=response,
model_response=model_response,
stream=stream,
logging_obj=logging_obj,
api_key=api_key,
data=data,
messages=messages,
print_verbose=print_verbose,
optional_params=optional_params,
encoding=encoding,
)
async def async_streaming(
self,
model: str,
messages: list,
api_base: str,
model_response: TextCompletionResponse,
print_verbose: Callable,
encoding,
api_key,
logging_obj,
data: dict,
timeout: Union[float, httpx.Timeout],
optional_params=None,
litellm_params=None,
logger_fn=None,
headers={},
) -> CustomStreamWrapper:
data["stream"] = True
streamwrapper = CustomStreamWrapper(
completion_stream=None,
make_call=partial(
make_call,
api_base=api_base,
headers=headers,
data=json.dumps(data),
model=model,
messages=messages,
logging_obj=logging_obj,
),
model=model,
custom_llm_provider="text-completion-codestral",
logging_obj=logging_obj,
)
return streamwrapper
def embedding(self, *args, **kwargs):
pass
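For context, a minimal usage sketch of this new integration, assuming the "text-completion-codestral/" prefix routes to this handler (as wired up in main.py later in this diff) and using an illustrative model id; the env var name matches _validate_environment above:
import os
import litellm

os.environ["CODESTRAL_API_KEY"] = "..."  # checked by _validate_environment above

# FIM-style completion against the default endpoint
# https://codestral.mistral.ai/v1/fim/completions
response = litellm.text_completion(
    model="text-completion-codestral/codestral-latest",  # model id is illustrative
    prompt="def is_odd(n):\n",
    suffix="\n    return result",  # optional FIM suffix, mapped by MistralTextCompletionConfig
    max_tokens=64,
)
print(response.choices[0].text)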

View file

@ -4,7 +4,6 @@ from enum import Enum
import requests, copy # type: ignore
import time
from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler

View file

@ -1,17 +1,22 @@
import os, types
import inspect
import json
from enum import Enum
import requests # type: ignore
import os
import time
from typing import Callable, Optional, Union, List, Literal, Any
import types
import uuid
from enum import Enum
from typing import Any, Callable, List, Literal, Optional, Union
import httpx # type: ignore
import requests # type: ignore
from pydantic import BaseModel
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason
import litellm, uuid
import httpx, inspect # type: ignore
from litellm.types.llms.vertex_ai import *
import litellm
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.prompt_templates.factory import (
convert_to_gemini_tool_call_result,
convert_to_anthropic_image_obj,
convert_to_gemini_tool_call_invoke,
convert_to_gemini_tool_call_result,
)
from litellm.types.files import (
get_file_mime_type_for_file_type,
@ -19,6 +24,8 @@ from litellm.types.files import (
is_gemini_1_5_accepted_file_type,
is_video_file_type,
)
from litellm.types.llms.vertex_ai import *
from litellm.utils import CustomStreamWrapper, ModelResponse, Usage
class VertexAIError(Exception):
@ -273,28 +280,6 @@ def _get_image_bytes_from_url(image_url: str) -> bytes:
raise Exception(f"An exception occurs with this image - {str(e)}")
def _load_image_from_url(image_url: str):
"""
Loads an image from a URL.
Args:
image_url (str): The URL of the image.
Returns:
Image: The loaded image.
"""
from vertexai.preview.generative_models import (
GenerativeModel,
Part,
GenerationConfig,
Image,
)
image_bytes = _get_image_bytes_from_url(image_url)
return Image.from_bytes(data=image_bytes)
def _convert_gemini_role(role: str) -> Literal["user", "model"]:
if role == "user":
return "user"
@ -322,28 +307,9 @@ def _process_gemini_image(image_url: str) -> PartType:
return PartType(file_data=file_data)
# Direct links
elif "https:/" in image_url:
image = _load_image_from_url(image_url)
_blob = BlobType(data=image.data, mime_type=image._mime_type)
return PartType(inline_data=_blob)
# Base64 encoding
elif "base64" in image_url:
import base64, re
# base 64 is passed as data:image/jpeg;base64,<base-64-encoded-image>
image_metadata, img_without_base_64 = image_url.split(",")
# read mime_type from img_without_base_64=data:image/jpeg;base64
# Extract MIME type using regular expression
mime_type_match = re.match(r"data:(.*?);base64", image_metadata)
if mime_type_match:
mime_type = mime_type_match.group(1)
else:
mime_type = "image/jpeg"
decoded_img = base64.b64decode(img_without_base_64)
_blob = BlobType(data=decoded_img, mime_type=mime_type)
elif "https:/" in image_url or "base64" in image_url:
image = convert_to_anthropic_image_obj(image_url)
_blob = BlobType(data=image["data"], mime_type=image["media_type"])
return PartType(inline_data=_blob)
raise Exception("Invalid image received - {}".format(image_url))
except Exception as e:
@ -371,7 +337,7 @@ def _gemini_convert_messages_with_history(messages: list) -> List[ContentType]:
_parts: List[PartType] = []
for element in messages[msg_i]["content"]:
if isinstance(element, dict):
if element["type"] == "text":
if element["type"] == "text" and len(element["text"]) > 0:
_part = PartType(text=element["text"])
_parts.append(_part)
elif element["type"] == "image_url":
@ -379,7 +345,10 @@ def _gemini_convert_messages_with_history(messages: list) -> List[ContentType]:
_part = _process_gemini_image(image_url=image_url)
_parts.append(_part) # type: ignore
user_content.extend(_parts)
else:
elif (
isinstance(messages[msg_i]["content"], str)
and len(messages[msg_i]["content"]) > 0
):
_part = PartType(text=messages[msg_i]["content"])
user_content.append(_part)
@ -479,23 +448,25 @@ def completion(
message="""Upgrade vertex ai. Run `pip install "google-cloud-aiplatform>=1.38"`""",
)
try:
import google.auth # type: ignore
import proto # type: ignore
from google.cloud import aiplatform # type: ignore
from google.cloud.aiplatform_v1beta1.types import (
content as gapic_content_types, # type: ignore
)
from google.protobuf import json_format # type: ignore
from google.protobuf.struct_pb2 import Value # type: ignore
from vertexai.language_models import CodeGenerationModel, TextGenerationModel
from vertexai.preview.generative_models import (
GenerationConfig,
GenerativeModel,
Part,
)
from vertexai.preview.language_models import (
ChatModel,
CodeChatModel,
InputOutputTextPair,
)
from vertexai.language_models import TextGenerationModel, CodeGenerationModel
from vertexai.preview.generative_models import (
GenerativeModel,
Part,
GenerationConfig,
)
from google.cloud import aiplatform # type: ignore
from google.protobuf import json_format # type: ignore
from google.protobuf.struct_pb2 import Value # type: ignore
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types # type: ignore
import google.auth # type: ignore
import proto # type: ignore
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
print_verbose(
@ -617,7 +588,7 @@ def completion(
llm_model = None
# NOTE: async prediction and streaming under "private" mode isn't supported by aiplatform right now
if acompletion == True:
if acompletion is True:
data = {
"llm_model": llm_model,
"mode": mode,
@ -649,7 +620,7 @@ def completion(
tools = optional_params.pop("tools", None)
content = _gemini_convert_messages_with_history(messages=messages)
stream = optional_params.pop("stream", False)
if stream == True:
if stream is True:
request_str += f"response = llm_model.generate_content({content}, generation_config=GenerationConfig(**{optional_params}), safety_settings={safety_settings}, stream={stream})\n"
logging_obj.pre_call(
input=prompt,
@ -1411,8 +1382,8 @@ def embedding(
message="vertexai import failed please run `pip install google-cloud-aiplatform`",
)
from vertexai.language_models import TextEmbeddingModel, TextEmbeddingInput
import google.auth # type: ignore
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
try:

View file

@ -6,7 +6,8 @@ from enum import Enum
import requests, copy # type: ignore
import time, uuid
from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
from litellm.litellm_core_utils.core_helpers import map_finish_reason
import litellm
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from .prompt_templates.factory import (
@ -237,7 +238,10 @@ def completion(
if vertex_credentials is not None and isinstance(vertex_credentials, str):
import google.oauth2.service_account
json_obj = json.loads(vertex_credentials)
try:
json_obj = json.loads(vertex_credentials)
except json.JSONDecodeError:
json_obj = json.load(open(vertex_credentials))
creds = (
google.oauth2.service_account.Credentials.from_service_account_info(

File diff suppressed because it is too large

View file

@ -7,107 +7,132 @@
#
# Thank you ! We ❤️ you! - Krrish & Ishaan
import os, openai, sys, json, inspect, uuid, datetime, threading
from typing import Any, Literal, Union, BinaryIO
from typing_extensions import overload
from functools import partial
import dotenv, traceback, random, asyncio, time, contextvars
import asyncio
import contextvars
import datetime
import inspect
import json
import os
import random
import sys
import threading
import time
import traceback
import uuid
from concurrent.futures import ThreadPoolExecutor
from copy import deepcopy
from functools import partial
from typing import (
Any,
BinaryIO,
Callable,
Dict,
List,
Literal,
Mapping,
Optional,
Union,
)
import dotenv
import httpx
import openai
import tiktoken
from typing_extensions import overload
import litellm
from ._logging import verbose_logger
from litellm import ( # type: ignore
Logging,
client,
exception_type,
get_optional_params,
get_litellm_params,
Logging,
get_optional_params,
)
from litellm.utils import (
get_secret,
CustomStreamWrapper,
read_config_args,
completion_with_fallbacks,
get_llm_provider,
get_api_key,
mock_completion_streaming_obj,
Usage,
async_mock_completion_streaming_obj,
completion_with_fallbacks,
convert_to_model_response_object,
token_counter,
create_pretrained_tokenizer,
create_tokenizer,
Usage,
get_api_key,
get_llm_provider,
get_optional_params_embeddings,
get_optional_params_image_gen,
get_secret,
mock_completion_streaming_obj,
read_config_args,
supports_httpx_timeout,
token_counter,
)
from ._logging import verbose_logger
from .caching import disable_cache, enable_cache, update_cache
from .llms import (
anthropic_text,
together_ai,
ai21,
sagemaker,
bedrock,
triton,
huggingface_restapi,
replicate,
aleph_alpha,
nlp_cloud,
anthropic_text,
baseten,
vllm,
ollama,
ollama_chat,
cloudflare,
bedrock,
clarifai,
cloudflare,
cohere,
cohere_chat,
petals,
gemini,
huggingface_restapi,
maritalk,
nlp_cloud,
ollama,
ollama_chat,
oobabooga,
openrouter,
palm,
gemini,
petals,
replicate,
sagemaker,
together_ai,
triton,
vertex_ai,
vertex_ai_anthropic,
maritalk,
vllm,
watsonx,
)
from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
from .llms.azure import AzureChatCompletion
from .llms.databricks import DatabricksChatCompletion
from .llms.azure_text import AzureTextCompletion
from .llms.anthropic import AnthropicChatCompletion
from .llms.anthropic_text import AnthropicTextCompletion
from .llms.azure import AzureChatCompletion
from .llms.azure_text import AzureTextCompletion
from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM
from .llms.databricks import DatabricksChatCompletion
from .llms.huggingface_restapi import Huggingface
from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
from .llms.predibase import PredibaseChatCompletion
from .llms.bedrock_httpx import BedrockLLM, BedrockConverseLLM
from .llms.vertex_httpx import VertexLLM
from .llms.triton import TritonChatCompletion
from .llms.prompt_templates.factory import (
prompt_factory,
custom_prompt,
function_call_prompt,
map_system_message_pt,
prompt_factory,
)
import tiktoken
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, List, Optional, Dict, Union, Mapping
from .caching import enable_cache, disable_cache, update_cache
from .llms.text_completion_codestral import CodestralTextCompletion
from .llms.triton import TritonChatCompletion
from .llms.vertex_httpx import VertexLLM
from .types.llms.openai import HttpxBinaryResponseContent
from .types.utils import ChatCompletionMessageToolCall
encoding = tiktoken.get_encoding("cl100k_base")
from litellm.utils import (
get_secret,
Choices,
CustomStreamWrapper,
TextCompletionStreamWrapper,
ModelResponse,
TextCompletionResponse,
TextChoices,
EmbeddingResponse,
ImageResponse,
read_config_args,
Choices,
Message,
ModelResponse,
TextChoices,
TextCompletionResponse,
TextCompletionStreamWrapper,
TranscriptionResponse,
get_secret,
read_config_args,
)
####### ENVIRONMENT VARIABLES ###################
@ -120,6 +145,7 @@ azure_chat_completions = AzureChatCompletion()
azure_text_completions = AzureTextCompletion()
huggingface = Huggingface()
predibase_chat_completions = PredibaseChatCompletion()
codestral_text_completions = CodestralTextCompletion()
triton_chat_completions = TritonChatCompletion()
bedrock_chat_completion = BedrockLLM()
bedrock_converse_chat_completion = BedrockConverseLLM()
@ -322,6 +348,8 @@ async def acompletion(
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq"
or custom_llm_provider == "codestral"
or custom_llm_provider == "text-completion-codestral"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "text-completion-openai"
or custom_llm_provider == "huggingface"
@ -329,6 +357,7 @@ async def acompletion(
or custom_llm_provider == "ollama_chat"
or custom_llm_provider == "replicate"
or custom_llm_provider == "vertex_ai"
or custom_llm_provider == "vertex_ai_beta"
or custom_llm_provider == "gemini"
or custom_llm_provider == "sagemaker"
or custom_llm_provider == "anthropic"
@ -350,9 +379,10 @@ async def acompletion(
else:
response = init_response # type: ignore
if custom_llm_provider == "text-completion-openai" and isinstance(
response, TextCompletionResponse
):
if (
custom_llm_provider == "text-completion-openai"
or custom_llm_provider == "text-completion-codestral"
) and isinstance(response, TextCompletionResponse):
response = litellm.OpenAITextCompletionConfig().convert_to_chat_model_response_object(
response_object=response,
model_response_object=litellm.ModelResponse(),
@ -367,7 +397,9 @@ async def acompletion(
return response
except Exception as e:
verbose_logger.error(
"litellm.acompletion(): Exception occured - {}".format(str(e))
"litellm.acompletion(): Exception occured - {}\n{}".format(
str(e), traceback.format_exc()
)
)
verbose_logger.debug(traceback.format_exc())
custom_llm_provider = custom_llm_provider or "openai"
@ -397,7 +429,9 @@ def mock_completion(
messages: List,
stream: Optional[bool] = False,
mock_response: Union[str, Exception] = "This is a mock request",
mock_tool_calls: Optional[List] = None,
logging=None,
custom_llm_provider=None,
**kwargs,
):
"""
@ -435,7 +469,7 @@ def mock_completion(
raise litellm.APIError(
status_code=getattr(mock_response, "status_code", 500), # type: ignore
message=getattr(mock_response, "text", str(mock_response)),
llm_provider=getattr(mock_response, "llm_provider", "openai"), # type: ignore
llm_provider=getattr(mock_response, "llm_provider", custom_llm_provider or "openai"), # type: ignore
model=model, # type: ignore
request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
)
@ -464,6 +498,12 @@ def mock_completion(
model_response["created"] = int(time.time())
model_response["model"] = model
if mock_tool_calls:
model_response["choices"][0]["message"]["tool_calls"] = [
ChatCompletionMessageToolCall(**tool_call)
for tool_call in mock_tool_calls
]
setattr(
model_response,
"usage",
@ -577,6 +617,7 @@ def completion(
args = locals()
api_base = kwargs.get("api_base", None)
mock_response = kwargs.get("mock_response", None)
mock_tool_calls = kwargs.get("mock_tool_calls", None)
force_timeout = kwargs.get("force_timeout", 600) ## deprecated
logger_fn = kwargs.get("logger_fn", None)
verbose = kwargs.get("verbose", False)
@ -895,15 +936,17 @@ def completion(
litellm_params=litellm_params,
custom_llm_provider=custom_llm_provider,
)
if mock_response:
if mock_response or mock_tool_calls:
return mock_completion(
model,
messages,
stream=stream,
mock_response=mock_response,
mock_tool_calls=mock_tool_calls,
logging=logging,
acompletion=acompletion,
mock_delay=kwargs.get("mock_delay", None),
custom_llm_provider=custom_llm_provider,
)
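As a hedged illustration of the new mock_tool_calls kwarg handled here: the tool-call dict shape below mirrors ChatCompletionMessageToolCall (OpenAI-style id/type/function), and the function name and arguments are made up for the sketch.
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "What's the weather in SF?"}],
    mock_tool_calls=[
        {
            "id": "call_abc123",  # hypothetical id
            "type": "function",
            "function": {"name": "get_weather", "arguments": '{"city": "SF"}'},
        }
    ],
)
print(response.choices[0].message.tool_calls)  # returns the mocked tool calls without hitting an API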
if custom_llm_provider == "azure":
# azure configs
@ -1035,91 +1078,6 @@ def completion(
"api_base": api_base,
},
)
elif (
model in litellm.open_ai_chat_completion_models
or custom_llm_provider == "custom_openai"
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "anyscale"
or custom_llm_provider == "mistral"
or custom_llm_provider == "openai"
or custom_llm_provider == "together_ai"
or custom_llm_provider in litellm.openai_compatible_providers
or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo
): # allow user to make an openai call with a custom base
# note: if a user sets a custom base - we should ensure this works
# allow for the setting of dynamic and stateful api-bases
api_base = (
api_base # for deepinfra/perplexity/anyscale/groq/friendliai we check in get_llm_provider and pass in the api base from there
or litellm.api_base
or get_secret("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
openai.organization = (
organization
or litellm.organization
or get_secret("OPENAI_ORGANIZATION")
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
api_key
or litellm.api_key # for deepinfra/perplexity/anyscale/friendliai we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or get_secret("OPENAI_API_KEY")
)
headers = headers or litellm.headers
## LOAD CONFIG - if set
config = litellm.OpenAIConfig.get_config()
for k, v in config.items():
if (
k not in optional_params
): # completion(top_k=3) > openai_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
## COMPLETION CALL
try:
response = openai_chat_completions.completion(
model=model,
messages=messages,
headers=headers,
model_response=model_response,
print_verbose=print_verbose,
api_key=api_key,
api_base=api_base,
acompletion=acompletion,
logging_obj=logging,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
timeout=timeout, # type: ignore
custom_prompt_dict=custom_prompt_dict,
client=client, # pass AsyncOpenAI, OpenAI client
organization=organization,
custom_llm_provider=custom_llm_provider,
)
except Exception as e:
## LOGGING - log the original exception returned
logging.post_call(
input=messages,
api_key=api_key,
original_response=str(e),
additional_args={"headers": headers},
)
raise e
if optional_params.get("stream", False):
## LOGGING
logging.post_call(
input=messages,
api_key=api_key,
original_response=response,
additional_args={"headers": headers},
)
elif (
custom_llm_provider == "text-completion-openai"
or "ft:babbage-002" in model
@ -1203,6 +1161,93 @@ def completion(
additional_args={"headers": headers},
)
response = _response
elif (
model in litellm.open_ai_chat_completion_models
or custom_llm_provider == "custom_openai"
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq"
or custom_llm_provider == "codestral"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "anyscale"
or custom_llm_provider == "mistral"
or custom_llm_provider == "openai"
or custom_llm_provider == "together_ai"
or custom_llm_provider in litellm.openai_compatible_providers
or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo
): # allow user to make an openai call with a custom base
# note: if a user sets a custom base - we should ensure this works
# allow for the setting of dynamic and stateful api-bases
api_base = (
api_base # for deepinfra/perplexity/anyscale/groq/friendliai we check in get_llm_provider and pass in the api base from there
or litellm.api_base
or get_secret("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
openai.organization = (
organization
or litellm.organization
or get_secret("OPENAI_ORGANIZATION")
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
api_key
or litellm.api_key # for deepinfra/perplexity/anyscale/friendliai we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or get_secret("OPENAI_API_KEY")
)
headers = headers or litellm.headers
## LOAD CONFIG - if set
config = litellm.OpenAIConfig.get_config()
for k, v in config.items():
if (
k not in optional_params
): # completion(top_k=3) > openai_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
## COMPLETION CALL
try:
response = openai_chat_completions.completion(
model=model,
messages=messages,
headers=headers,
model_response=model_response,
print_verbose=print_verbose,
api_key=api_key,
api_base=api_base,
acompletion=acompletion,
logging_obj=logging,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
timeout=timeout, # type: ignore
custom_prompt_dict=custom_prompt_dict,
client=client, # pass AsyncOpenAI, OpenAI client
organization=organization,
custom_llm_provider=custom_llm_provider,
)
except Exception as e:
## LOGGING - log the original exception returned
logging.post_call(
input=messages,
api_key=api_key,
original_response=str(e),
additional_args={"headers": headers},
)
raise e
if optional_params.get("stream", False):
## LOGGING
logging.post_call(
input=messages,
api_key=api_key,
original_response=response,
additional_args={"headers": headers},
)
elif (
"replicate" in model
or custom_llm_provider == "replicate"
@ -1840,7 +1885,25 @@ def completion(
)
return response
response = model_response
elif custom_llm_provider == "gemini":
elif custom_llm_provider == "vertex_ai_beta" or custom_llm_provider == "gemini":
vertex_ai_project = (
optional_params.pop("vertex_project", None)
or optional_params.pop("vertex_ai_project", None)
or litellm.vertex_project
or get_secret("VERTEXAI_PROJECT")
)
vertex_ai_location = (
optional_params.pop("vertex_location", None)
or optional_params.pop("vertex_ai_location", None)
or litellm.vertex_location
or get_secret("VERTEXAI_LOCATION")
)
vertex_credentials = (
optional_params.pop("vertex_credentials", None)
or optional_params.pop("vertex_ai_credentials", None)
or get_secret("VERTEXAI_CREDENTIALS")
)
gemini_api_key = (
api_key
or get_secret("GEMINI_API_KEY")
@ -1848,34 +1911,28 @@ def completion(
or litellm.api_key
)
# palm does not support streaming as yet :(
model_response = gemini.completion(
new_params = deepcopy(optional_params)
response = vertex_chat_completion.completion( # type: ignore
model=model,
messages=messages,
model_response=model_response,
print_verbose=print_verbose,
optional_params=optional_params,
optional_params=new_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
encoding=encoding,
api_key=gemini_api_key,
vertex_location=vertex_ai_location,
vertex_project=vertex_ai_project,
vertex_credentials=vertex_credentials,
gemini_api_key=gemini_api_key,
logging_obj=logging,
acompletion=acompletion,
custom_prompt_dict=custom_prompt_dict,
timeout=timeout,
custom_llm_provider=custom_llm_provider,
client=client,
api_base=api_base,
)
if (
"stream" in optional_params
and optional_params["stream"] == True
and acompletion == False
):
response = CustomStreamWrapper(
iter(model_response),
model,
custom_llm_provider="gemini",
logging_obj=logging,
)
return response
response = model_response
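A minimal sketch of reaching this consolidated vertex_ai_beta / gemini branch; the model ids come from the pricing entries added later in this diff, and the env vars are the same secrets read above (GEMINI_API_KEY, VERTEXAI_PROJECT, VERTEXAI_LOCATION):
import os
import litellm

# Google AI Studio path ("gemini/" prefix), keyed by GEMINI_API_KEY
os.environ["GEMINI_API_KEY"] = "..."
response = litellm.completion(
    model="gemini/gemini-1.5-flash",
    messages=[{"role": "user", "content": "Hello"}],
)

# Vertex AI path ("vertex_ai_beta/" prefix), using project/location instead of an API key
response = litellm.completion(
    model="vertex_ai_beta/gemini-1.5-flash-001",
    messages=[{"role": "user", "content": "Hello"}],
    vertex_project=os.environ.get("VERTEXAI_PROJECT"),
    vertex_location=os.environ.get("VERTEXAI_LOCATION"),
)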
elif custom_llm_provider == "vertex_ai":
vertex_ai_project = (
optional_params.pop("vertex_project", None)
@ -1894,6 +1951,7 @@ def completion(
or optional_params.pop("vertex_ai_credentials", None)
or get_secret("VERTEXAI_CREDENTIALS")
)
new_params = deepcopy(optional_params)
if "claude-3" in model:
model_response = vertex_ai_anthropic.completion(
@ -1982,6 +2040,46 @@ def completion(
timeout=timeout,
)
if (
"stream" in optional_params
and optional_params["stream"] is True
and acompletion is False
):
return _model_response
response = _model_response
elif custom_llm_provider == "text-completion-codestral":
api_base = (
api_base
or optional_params.pop("api_base", None)
or optional_params.pop("base_url", None)
or litellm.api_base
or "https://codestral.mistral.ai/v1/fim/completions"
)
api_key = api_key or litellm.api_key or get_secret("CODESTRAL_API_KEY")
text_completion_model_response = litellm.TextCompletionResponse(
stream=stream
)
_model_response = codestral_text_completions.completion( # type: ignore
model=model,
messages=messages,
model_response=text_completion_model_response,
print_verbose=print_verbose,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
encoding=encoding,
logging_obj=logging,
acompletion=acompletion,
api_base=api_base,
custom_prompt_dict=custom_prompt_dict,
api_key=api_key,
timeout=timeout,
)
if (
"stream" in optional_params
and optional_params["stream"] is True
@ -3371,7 +3469,9 @@ def embedding(
###### Text Completion ################
@client
async def atext_completion(*args, **kwargs):
async def atext_completion(
*args, **kwargs
) -> Union[TextCompletionResponse, TextCompletionStreamWrapper]:
"""
Implemented to handle async streaming for the text completion endpoint
"""
@ -3403,6 +3503,7 @@ async def atext_completion(*args, **kwargs):
or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq"
or custom_llm_provider == "text-completion-codestral"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "fireworks_ai"
or custom_llm_provider == "text-completion-openai"
@ -3664,6 +3765,7 @@ def text_completion(
custom_llm_provider == "openai"
or custom_llm_provider == "azure"
or custom_llm_provider == "azure_text"
or custom_llm_provider == "text-completion-codestral"
or custom_llm_provider == "text-completion-openai"
)
and isinstance(prompt, list)
@ -3680,6 +3782,12 @@ def text_completion(
)
kwargs.pop("prompt", None)
if model is not None and model.startswith(
"openai/"
): # for openai compatible endpoints - e.g. vllm, call the native /v1/completions endpoint for text completion calls
model = model.replace("openai/", "text-completion-openai/")
kwargs["text_completion"] = True
response = completion(
model=model,
@ -3842,6 +3950,7 @@ def image_generation(
proxy_server_request = kwargs.get("proxy_server_request", None)
model_info = kwargs.get("model_info", None)
metadata = kwargs.get("metadata", {})
client = kwargs.get("client", None)
model_response = litellm.utils.ImageResponse()
if model is not None or custom_llm_provider is not None:
@ -3980,6 +4089,7 @@ def image_generation(
model_response=model_response,
api_version=api_version,
aimg_generation=aimg_generation,
client=client,
)
elif custom_llm_provider == "openai":
model_response = openai_chat_completions.image_generation(
@ -3992,6 +4102,7 @@ def image_generation(
optional_params=optional_params,
model_response=model_response,
aimg_generation=aimg_generation,
client=client,
)
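For the client kwarg now threaded through image_generation, a hedged sketch; reusing a caller-constructed OpenAI client is the assumed intent, and the client object below is illustrative:
import litellm
from openai import OpenAI

# reuse one HTTP client / connection pool across image generation calls
openai_client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

image = litellm.image_generation(
    model="dall-e-3",
    prompt="a watercolor fox",
    client=openai_client,  # forwarded to openai_chat_completions.image_generation above
)
print(image.data[0].url)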
elif custom_llm_provider == "bedrock":
if model is None:

View file

@ -234,6 +234,30 @@
"litellm_provider": "openai",
"mode": "chat"
},
"ft:gpt-4-0613": {
"max_tokens": 4096,
"max_input_tokens": 8192,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00003,
"output_cost_per_token": 0.00006,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"source": "OpenAI needs to add pricing for this ft model, will be updated when added by OpenAI. Defaulting to base model pricing"
},
"ft:gpt-4o-2024-05-13": {
"max_tokens": 4096,
"max_input_tokens": 128000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000005,
"output_cost_per_token": 0.000015,
"litellm_provider": "openai",
"mode": "chat",
"supports_function_calling": true,
"supports_parallel_function_calling": true,
"supports_vision": true,
"source": "OpenAI needs to add pricing for this ft model, will be updated when added by OpenAI. Defaulting to base model pricing"
},
"ft:davinci-002": {
"max_tokens": 16384,
"max_input_tokens": 16384,
@ -499,7 +523,7 @@
"max_tokens": 4096,
"max_input_tokens": 16384,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000015,
"input_cost_per_token": 0.000001,
"output_cost_per_token": 0.000002,
"litellm_provider": "azure",
"mode": "chat",
@ -841,7 +865,7 @@
},
"deepseek-coder": {
"max_tokens": 4096,
"max_input_tokens": 16000,
"max_input_tokens": 32000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000014,
"output_cost_per_token": 0.00000028,
@ -862,7 +886,7 @@
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000010,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000010,
"litellm_provider": "groq",
"mode": "chat",
@ -872,8 +896,8 @@
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000064,
"output_cost_per_token": 0.00000080,
"input_cost_per_token": 0.00000059,
"output_cost_per_token": 0.00000079,
"litellm_provider": "groq",
"mode": "chat",
"supports_function_calling": true
@ -991,6 +1015,18 @@
"supports_vision": true,
"tool_use_system_prompt_tokens": 159
},
"claude-3-5-sonnet-20240620": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "anthropic",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true,
"tool_use_system_prompt_tokens": 159
},
"text-bison": {
"max_tokens": 1024,
"max_input_tokens": 8192,
@ -1155,30 +1191,42 @@
"max_tokens": 8192,
"max_input_tokens": 32760,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.0000005,
"input_cost_per_image": 0.0025,
"input_cost_per_video_per_second": 0.002,
"input_cost_per_token": 0.0000005,
"input_cost_per_character": 0.000000125,
"output_cost_per_token": 0.0000015,
"output_cost_per_character": 0.000000375,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing"
},
"gemini-1.0-pro": {
"max_tokens": 8192,
"max_input_tokens": 32760,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.0000005,
"input_cost_per_image": 0.0025,
"input_cost_per_video_per_second": 0.002,
"input_cost_per_token": 0.0000005,
"input_cost_per_character": 0.000000125,
"output_cost_per_token": 0.0000015,
"output_cost_per_character": 0.000000375,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
},
"gemini-1.0-pro-001": {
"max_tokens": 8192,
"max_input_tokens": 32760,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.0000005,
"input_cost_per_image": 0.0025,
"input_cost_per_video_per_second": 0.002,
"input_cost_per_token": 0.0000005,
"input_cost_per_character": 0.000000125,
"output_cost_per_token": 0.0000015,
"output_cost_per_character": 0.000000375,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
@ -1188,8 +1236,12 @@
"max_tokens": 8192,
"max_input_tokens": 32760,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000025,
"output_cost_per_token": 0.0000005,
"input_cost_per_image": 0.0025,
"input_cost_per_video_per_second": 0.002,
"input_cost_per_token": 0.0000005,
"input_cost_per_character": 0.000000125,
"output_cost_per_token": 0.0000015,
"output_cost_per_character": 0.000000375,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
@ -1199,14 +1251,157 @@
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000000625,
"output_cost_per_token": 0.000001875,
"input_cost_per_image": 0.001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_video_per_second": 0.001315,
"input_cost_per_token": 0.000005,
"input_cost_per_character": 0.00000125,
"input_cost_per_token_above_128k_tokens": 0.00001,
"input_cost_per_character_above_128k_tokens": 0.0000025,
"output_cost_per_token": 0.000015,
"output_cost_per_character": 0.00000375,
"output_cost_per_token_above_128k_tokens": 0.00003,
"output_cost_per_character_above_128k_tokens": 0.0000075,
"output_cost_per_image": 0.00263,
"output_cost_per_video_per_second": 0.00263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-001": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_image": 0.001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_video_per_second": 0.001315,
"input_cost_per_token": 0.000005,
"input_cost_per_character": 0.00000125,
"input_cost_per_token_above_128k_tokens": 0.00001,
"input_cost_per_character_above_128k_tokens": 0.0000025,
"output_cost_per_token": 0.000015,
"output_cost_per_character": 0.00000375,
"output_cost_per_token_above_128k_tokens": 0.00003,
"output_cost_per_character_above_128k_tokens": 0.0000075,
"output_cost_per_image": 0.00263,
"output_cost_per_video_per_second": 0.00263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-preview-0514": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_image": 0.001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_video_per_second": 0.001315,
"input_cost_per_token": 0.000005,
"input_cost_per_character": 0.00000125,
"input_cost_per_token_above_128k_tokens": 0.00001,
"input_cost_per_character_above_128k_tokens": 0.0000025,
"output_cost_per_token": 0.000015,
"output_cost_per_character": 0.00000375,
"output_cost_per_token_above_128k_tokens": 0.00003,
"output_cost_per_character_above_128k_tokens": 0.0000075,
"output_cost_per_image": 0.00263,
"output_cost_per_video_per_second": 0.00263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-preview-0215": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_image": 0.001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_video_per_second": 0.001315,
"input_cost_per_token": 0.000005,
"input_cost_per_character": 0.00000125,
"input_cost_per_token_above_128k_tokens": 0.00001,
"input_cost_per_character_above_128k_tokens": 0.0000025,
"output_cost_per_token": 0.000015,
"output_cost_per_character": 0.00000375,
"output_cost_per_token_above_128k_tokens": 0.00003,
"output_cost_per_character_above_128k_tokens": 0.0000075,
"output_cost_per_image": 0.00263,
"output_cost_per_video_per_second": 0.00263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-preview-0409": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_image": 0.001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_video_per_second": 0.001315,
"input_cost_per_token": 0.000005,
"input_cost_per_character": 0.00000125,
"input_cost_per_token_above_128k_tokens": 0.00001,
"input_cost_per_character_above_128k_tokens": 0.0000025,
"output_cost_per_token": 0.000015,
"output_cost_per_character": 0.00000375,
"output_cost_per_token_above_128k_tokens": 0.00003,
"output_cost_per_character_above_128k_tokens": 0.0000075,
"output_cost_per_image": 0.00263,
"output_cost_per_video_per_second": 0.00263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-flash": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"max_images_per_prompt": 3000,
"max_videos_per_prompt": 10,
"max_video_length": 1,
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_image": 0.0001315,
"input_cost_per_video_per_second": 0.0001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_token": 0.0000005,
"input_cost_per_character": 0.000000125,
"input_cost_per_token_above_128k_tokens": 0.000001,
"input_cost_per_character_above_128k_tokens": 0.00000025,
"output_cost_per_token": 0.0000015,
"output_cost_per_character": 0.000000375,
"output_cost_per_token_above_128k_tokens": 0.000003,
"output_cost_per_character_above_128k_tokens": 0.00000075,
"output_cost_per_image": 0.000263,
"output_cost_per_video_per_second": 0.000263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-flash-001": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
@ -1217,10 +1412,23 @@
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"input_cost_per_image": 0.0001315,
"input_cost_per_video_per_second": 0.0001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_token": 0.0000005,
"input_cost_per_character": 0.000000125,
"input_cost_per_token_above_128k_tokens": 0.000001,
"input_cost_per_character_above_128k_tokens": 0.00000025,
"output_cost_per_token": 0.0000015,
"output_cost_per_character": 0.000000375,
"output_cost_per_token_above_128k_tokens": 0.000003,
"output_cost_per_character_above_128k_tokens": 0.00000075,
"output_cost_per_image": 0.000263,
"output_cost_per_video_per_second": 0.000263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
@ -1235,62 +1443,27 @@
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"input_cost_per_image": 0.0001315,
"input_cost_per_video_per_second": 0.0001315,
"input_cost_per_audio_per_second": 0.000125,
"input_cost_per_token": 0.0000005,
"input_cost_per_character": 0.000000125,
"input_cost_per_token_above_128k_tokens": 0.000001,
"input_cost_per_character_above_128k_tokens": 0.00000025,
"output_cost_per_token": 0.0000015,
"output_cost_per_character": 0.000000375,
"output_cost_per_token_above_128k_tokens": 0.000003,
"output_cost_per_character_above_128k_tokens": 0.00000075,
"output_cost_per_image": 0.000263,
"output_cost_per_video_per_second": 0.000263,
"output_cost_per_audio_per_second": 0.00025,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-001": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000000625,
"output_cost_per_token": 0.000001875,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-preview-0514": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000000625,
"output_cost_per_token": 0.000001875,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-preview-0215": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000000625,
"output_cost_per_token": 0.000001875,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-1.5-pro-preview-0409": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_token": 0.000000625,
"output_cost_per_token": 0.000001875,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini-experimental": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
@ -1359,6 +1532,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"vertex_ai/claude-3-5-sonnet@20240620": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "vertex_ai-anthropic_models",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"vertex_ai/claude-3-haiku@20240307": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -1538,6 +1722,27 @@
"mode": "completion",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini/gemini-1.5-flash": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"max_images_per_prompt": 3000,
"max_videos_per_prompt": 10,
"max_video_length": 1,
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"gemini/gemini-1.5-flash-latest": {
"max_tokens": 8192,
"max_input_tokens": 1000000,
@ -1547,11 +1752,14 @@
"max_video_length": 1,
"max_audio_length_hours": 8.4,
"max_audio_per_prompt": 1,
"max_pdf_size_mb": 30,
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"max_pdf_size_mb": 30,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
@ -1560,8 +1768,10 @@
"max_tokens": 8192,
"max_input_tokens": 32760,
"max_output_tokens": 8192,
"input_cost_per_token": 0.0,
"output_cost_per_token": 0.0,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_function_calling": true,
@ -1571,10 +1781,13 @@
"max_tokens": 8192,
"max_input_tokens": 1000000,
"max_output_tokens": 8192,
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"supports_tool_choice": true,
@ -1584,10 +1797,13 @@
"max_tokens": 8192,
"max_input_tokens": 1048576,
"max_output_tokens": 8192,
"input_cost_per_token": 0,
"output_cost_per_token": 0,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
"supports_function_calling": true,
"supports_vision": true,
"supports_tool_choice": true,
@ -1597,8 +1813,10 @@
"max_tokens": 2048,
"max_input_tokens": 30720,
"max_output_tokens": 2048,
"input_cost_per_token": 0.0,
"output_cost_per_token": 0.0,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_function_calling": true,
@ -1796,6 +2014,15 @@
"litellm_provider": "replicate",
"mode": "chat"
},
"openrouter/deepseek/deepseek-coder": {
"max_tokens": 4096,
"max_input_tokens": 32000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000014,
"output_cost_per_token": 0.00000028,
"litellm_provider": "openrouter",
"mode": "chat"
},
"openrouter/microsoft/wizardlm-2-8x22b:nitro": {
"max_tokens": 65536,
"input_cost_per_token": 0.000001,
@ -2349,6 +2576,17 @@
"supports_function_calling": true,
"supports_vision": true
},
"anthropic.claude-3-5-sonnet-20240620-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000003,
"output_cost_per_token": 0.000015,
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_vision": true
},
"anthropic.claude-3-haiku-20240307-v1:0": {
"max_tokens": 4096,
"max_input_tokens": 200000,
@ -3377,6 +3615,24 @@
"litellm_provider": "deepinfra",
"mode": "chat"
},
"deepinfra/meta-llama/Meta-Llama-3-8B-Instruct": {
"max_tokens": 8191,
"max_input_tokens": 8191,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000008,
"output_cost_per_token": 0.00000008,
"litellm_provider": "deepinfra",
"mode": "chat"
},
"deepinfra/meta-llama/Meta-Llama-3-70B-Instruct": {
"max_tokens": 8191,
"max_input_tokens": 8191,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000059,
"output_cost_per_token": 0.00000079,
"litellm_provider": "deepinfra",
"mode": "chat"
},
"deepinfra/01-ai/Yi-34B-200K": {
"max_tokens": 4096,
"max_input_tokens": 200000,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{87421:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=87421)}),_N_E=n.O()}]);
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{87421:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_12bbc4', '__Inter_Fallback_12bbc4'",fontStyle:"normal"},className:"__className_12bbc4"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=87421)}),_N_E=n.O()}]);

Some files were not shown because too many files have changed in this diff