Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 03:04:13 +00:00)

Merge branch 'main' into fix/reset-end-user-budget-by-duration

Commit 9f12cba8bc: 231 changed files with 6692 additions and 1224 deletions
|
@ -72,6 +72,7 @@ jobs:
|
|||
pip install "jsonschema==4.22.0"
|
||||
pip install "pytest-xdist==3.6.1"
|
||||
pip install "websockets==10.4"
|
||||
pip uninstall posthog -y
|
||||
- save_cache:
|
||||
paths:
|
||||
- ./venv
|
||||
|
@ -1517,6 +1518,117 @@ jobs:
|
|||
- store_test_results:
|
||||
path: test-results
|
||||
|
||||
proxy_multi_instance_tests:
|
||||
machine:
|
||||
image: ubuntu-2204:2023.10.1
|
||||
resource_class: xlarge
|
||||
working_directory: ~/project
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Install Docker CLI (In case it's not already installed)
|
||||
command: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
|
||||
- run:
|
||||
name: Install Python 3.9
|
||||
command: |
|
||||
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
|
||||
bash miniconda.sh -b -p $HOME/miniconda
|
||||
export PATH="$HOME/miniconda/bin:$PATH"
|
||||
conda init bash
|
||||
source ~/.bashrc
|
||||
conda create -n myenv python=3.9 -y
|
||||
conda activate myenv
|
||||
python --version
|
||||
- run:
|
||||
name: Install Dependencies
|
||||
command: |
|
||||
pip install "pytest==7.3.1"
|
||||
pip install "pytest-asyncio==0.21.1"
|
||||
pip install aiohttp
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install -r requirements.txt
|
||||
pip install "pytest==7.3.1"
|
||||
pip install "pytest-retry==1.6.3"
|
||||
pip install "pytest-mock==3.12.0"
|
||||
pip install "pytest-asyncio==0.21.1"
|
||||
- run:
|
||||
name: Build Docker image
|
||||
command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
|
||||
- run:
|
||||
name: Run Docker container 1
|
||||
# intentionally give bad redis credentials here
|
||||
# the OTEL test - should get this as a trace
|
||||
command: |
|
||||
docker run -d \
|
||||
-p 4000:4000 \
|
||||
-e DATABASE_URL=$PROXY_DATABASE_URL \
|
||||
-e REDIS_HOST=$REDIS_HOST \
|
||||
-e REDIS_PASSWORD=$REDIS_PASSWORD \
|
||||
-e REDIS_PORT=$REDIS_PORT \
|
||||
-e LITELLM_MASTER_KEY="sk-1234" \
|
||||
-e LITELLM_LICENSE=$LITELLM_LICENSE \
|
||||
-e USE_DDTRACE=True \
|
||||
-e DD_API_KEY=$DD_API_KEY \
|
||||
-e DD_SITE=$DD_SITE \
|
||||
--name my-app \
|
||||
-v $(pwd)/litellm/proxy/example_config_yaml/multi_instance_simple_config.yaml:/app/config.yaml \
|
||||
my-app:latest \
|
||||
--config /app/config.yaml \
|
||||
--port 4000 \
|
||||
--detailed_debug
|
||||
- run:
|
||||
name: Run Docker container 2
|
||||
command: |
|
||||
docker run -d \
|
||||
-p 4001:4001 \
|
||||
-e DATABASE_URL=$PROXY_DATABASE_URL \
|
||||
-e REDIS_HOST=$REDIS_HOST \
|
||||
-e REDIS_PASSWORD=$REDIS_PASSWORD \
|
||||
-e REDIS_PORT=$REDIS_PORT \
|
||||
-e LITELLM_MASTER_KEY="sk-1234" \
|
||||
-e LITELLM_LICENSE=$LITELLM_LICENSE \
|
||||
-e USE_DDTRACE=True \
|
||||
-e DD_API_KEY=$DD_API_KEY \
|
||||
-e DD_SITE=$DD_SITE \
|
||||
--name my-app-2 \
|
||||
-v $(pwd)/litellm/proxy/example_config_yaml/multi_instance_simple_config.yaml:/app/config.yaml \
|
||||
my-app:latest \
|
||||
--config /app/config.yaml \
|
||||
--port 4001 \
|
||||
--detailed_debug
|
||||
- run:
|
||||
name: Install curl and dockerize
|
||||
command: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y curl
|
||||
sudo wget https://github.com/jwilder/dockerize/releases/download/v0.6.1/dockerize-linux-amd64-v0.6.1.tar.gz
|
||||
sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-v0.6.1.tar.gz
|
||||
sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
|
||||
- run:
|
||||
name: Start outputting logs
|
||||
command: docker logs -f my-app
|
||||
background: true
|
||||
- run:
|
||||
name: Wait for instance 1 to be ready
|
||||
command: dockerize -wait http://localhost:4000 -timeout 5m
|
||||
- run:
|
||||
name: Wait for instance 2 to be ready
|
||||
command: dockerize -wait http://localhost:4001 -timeout 5m
|
||||
- run:
|
||||
name: Run tests
|
||||
command: |
|
||||
pwd
|
||||
ls
|
||||
python -m pytest -vv tests/multi_instance_e2e_tests -x --junitxml=test-results/junit.xml --durations=5
|
||||
no_output_timeout: 120m
|
||||
# Clean up first container
|
||||
# Store test results
|
||||
- store_test_results:
|
||||
path: test-results
|
||||
|
||||
proxy_store_model_in_db_tests:
|
||||
machine:
|
||||
image: ubuntu-2204:2023.10.1
|
||||
|
@ -1552,6 +1664,7 @@ jobs:
|
|||
pip install "pytest-retry==1.6.3"
|
||||
pip install "pytest-mock==3.12.0"
|
||||
pip install "pytest-asyncio==0.21.1"
|
||||
pip install "assemblyai==0.37.0"
|
||||
- run:
|
||||
name: Build Docker image
|
||||
command: docker build -t my-app:latest -f ./docker/Dockerfile.database .
|
||||
|
@ -2171,6 +2284,12 @@ workflows:
|
|||
only:
|
||||
- main
|
||||
- /litellm_.*/
|
||||
- proxy_multi_instance_tests:
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- main
|
||||
- /litellm_.*/
|
||||
- proxy_store_model_in_db_tests:
|
||||
filters:
|
||||
branches:
|
||||
|
@ -2302,6 +2421,7 @@ workflows:
|
|||
- installing_litellm_on_python
|
||||
- installing_litellm_on_python_3_13
|
||||
- proxy_logging_guardrails_model_info_tests
|
||||
- proxy_multi_instance_tests
|
||||
- proxy_store_model_in_db_tests
|
||||
- proxy_build_from_pip_tests
|
||||
- proxy_pass_through_endpoint_tests
|
||||
|
|
|
@ -20,3 +20,8 @@ REPLICATE_API_TOKEN = ""
|
|||
ANTHROPIC_API_KEY = ""
|
||||
# Infisical
|
||||
INFISICAL_TOKEN = ""
|
||||
|
||||
# Development Configs
|
||||
LITELLM_MASTER_KEY = "sk-1234"
|
||||
DATABASE_URL = "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
|
||||
STORE_MODEL_IN_DB = "True"
|
45 .github/workflows/interpret_load_test.py vendored
|
@ -52,6 +52,39 @@ def interpret_results(csv_file):
|
|||
return markdown_table
|
||||
|
||||
|
||||
def _get_docker_run_command_stable_release(release_version):
|
||||
return f"""
|
||||
\n\n
|
||||
## Docker Run LiteLLM Proxy
|
||||
|
||||
```
|
||||
docker run \\
|
||||
-e STORE_MODEL_IN_DB=True \\
|
||||
-p 4000:4000 \\
|
||||
ghcr.io/berriai/litellm_stable_release_branch-{release_version}
```
|
||||
"""
|
||||
|
||||
|
||||
def _get_docker_run_command(release_version):
|
||||
return f"""
|
||||
\n\n
|
||||
## Docker Run LiteLLM Proxy
|
||||
|
||||
```
|
||||
docker run \\
|
||||
-e STORE_MODEL_IN_DB=True \\
|
||||
-p 4000:4000 \\
|
||||
ghcr.io/berriai/litellm:main-{release_version}
```
|
||||
"""
|
||||
|
||||
|
||||
def get_docker_run_command(release_version):
|
||||
if "stable" in release_version:
|
||||
return _get_docker_run_command_stable_release(release_version)
|
||||
else:
|
||||
return _get_docker_run_command(release_version)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
csv_file = "load_test_stats.csv" # Change this to the path of your CSV file
|
||||
markdown_table = interpret_results(csv_file)
|
||||
|
@ -79,17 +112,7 @@ if __name__ == "__main__":
|
|||
start_index = latest_release.body.find("Load Test LiteLLM Proxy Results")
|
||||
existing_release_body = latest_release.body[:start_index]
|
||||
|
||||
docker_run_command = f"""
|
||||
\n\n
|
||||
## Docker Run LiteLLM Proxy
|
||||
|
||||
```
|
||||
docker run \\
|
||||
-e STORE_MODEL_IN_DB=True \\
|
||||
-p 4000:4000 \\
|
||||
ghcr.io/berriai/litellm:main-{release_version}
|
||||
```
|
||||
"""
|
||||
docker_run_command = get_docker_run_command(release_version)
|
||||
print("docker run command: ", docker_run_command)
|
||||
|
||||
new_release_body = (
|
||||
|
|
17 README.md
|
@ -451,3 +451,20 @@ If you have suggestions on how to improve the code quality feel free to open an
|
|||
<a href="https://github.com/BerriAI/litellm/graphs/contributors">
|
||||
<img src="https://contrib.rocks/image?repo=BerriAI/litellm" />
|
||||
</a>
|
||||
|
||||
|
||||
## Run in Developer mode
|
||||
### Services
|
||||
1. Set up the .env file in root
|
||||
2. Run dependent services `docker-compose up db prometheus`
|
||||
|
||||
### Backend
|
||||
1. (In root) create virtual environment `python -m venv .venv`
|
||||
2. Activate virtual environment `source .venv/bin/activate`
|
||||
3. Install dependencies `pip install -e ".[all]"`
|
||||
4. Start proxy backend `uvicorn litellm.proxy.proxy_server:app --host localhost --port 4000 --reload`
|
||||
|
||||
### Frontend
|
||||
1. Navigate to `ui/litellm-dashboard`
|
||||
2. Install dependencies `npm install`
|
||||
3. Run `npm run dev` to start the dashboard
|
172 cookbook/logging_observability/LiteLLM_Arize.ipynb vendored Normal file
|
@ -0,0 +1,172 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "4FbDOmcj2VkM"
|
||||
},
|
||||
"source": [
|
||||
"## Use LiteLLM with Arize\n",
|
||||
"https://docs.litellm.ai/docs/observability/arize_integration\n",
|
||||
"\n",
|
||||
"This method uses the litellm proxy to send the data to Arize. The callback is set in the litellm config below, instead of using OpenInference tracing."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "21W8Woog26Ns"
|
||||
},
|
||||
"source": [
|
||||
"## Install Dependencies"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"id": "xrjKLBxhxu2L"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Requirement already satisfied: litellm in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (1.54.1)\n",
|
||||
"Requirement already satisfied: aiohttp in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (3.11.10)\n",
|
||||
"Requirement already satisfied: click in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (8.1.7)\n",
|
||||
"Requirement already satisfied: httpx<0.28.0,>=0.23.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (0.27.2)\n",
|
||||
"Requirement already satisfied: importlib-metadata>=6.8.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (8.5.0)\n",
|
||||
"Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (3.1.4)\n",
|
||||
"Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (4.23.0)\n",
|
||||
"Requirement already satisfied: openai>=1.55.3 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (1.57.1)\n",
|
||||
"Requirement already satisfied: pydantic<3.0.0,>=2.0.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (2.10.3)\n",
|
||||
"Requirement already satisfied: python-dotenv>=0.2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (1.0.1)\n",
|
||||
"Requirement already satisfied: requests<3.0.0,>=2.31.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (2.32.3)\n",
|
||||
"Requirement already satisfied: tiktoken>=0.7.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (0.7.0)\n",
|
||||
"Requirement already satisfied: tokenizers in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from litellm) (0.21.0)\n",
|
||||
"Requirement already satisfied: anyio in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (4.7.0)\n",
|
||||
"Requirement already satisfied: certifi in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (2024.8.30)\n",
|
||||
"Requirement already satisfied: httpcore==1.* in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (1.0.7)\n",
|
||||
"Requirement already satisfied: idna in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (3.10)\n",
|
||||
"Requirement already satisfied: sniffio in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpx<0.28.0,>=0.23.0->litellm) (1.3.1)\n",
|
||||
"Requirement already satisfied: h11<0.15,>=0.13 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from httpcore==1.*->httpx<0.28.0,>=0.23.0->litellm) (0.14.0)\n",
|
||||
"Requirement already satisfied: zipp>=3.20 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from importlib-metadata>=6.8.0->litellm) (3.21.0)\n",
|
||||
"Requirement already satisfied: MarkupSafe>=2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jinja2<4.0.0,>=3.1.2->litellm) (3.0.2)\n",
|
||||
"Requirement already satisfied: attrs>=22.2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (24.2.0)\n",
|
||||
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (2024.10.1)\n",
|
||||
"Requirement already satisfied: referencing>=0.28.4 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (0.35.1)\n",
|
||||
"Requirement already satisfied: rpds-py>=0.7.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (0.22.3)\n",
|
||||
"Requirement already satisfied: distro<2,>=1.7.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (1.9.0)\n",
|
||||
"Requirement already satisfied: jiter<1,>=0.4.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (0.6.1)\n",
|
||||
"Requirement already satisfied: tqdm>4 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (4.67.1)\n",
|
||||
"Requirement already satisfied: typing-extensions<5,>=4.11 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from openai>=1.55.3->litellm) (4.12.2)\n",
|
||||
"Requirement already satisfied: annotated-types>=0.6.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm) (0.7.0)\n",
|
||||
"Requirement already satisfied: pydantic-core==2.27.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from pydantic<3.0.0,>=2.0.0->litellm) (2.27.1)\n",
|
||||
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from requests<3.0.0,>=2.31.0->litellm) (3.4.0)\n",
|
||||
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from requests<3.0.0,>=2.31.0->litellm) (2.0.7)\n",
|
||||
"Requirement already satisfied: regex>=2022.1.18 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from tiktoken>=0.7.0->litellm) (2024.11.6)\n",
|
||||
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (2.4.4)\n",
|
||||
"Requirement already satisfied: aiosignal>=1.1.2 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (1.3.1)\n",
|
||||
"Requirement already satisfied: frozenlist>=1.1.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (1.5.0)\n",
|
||||
"Requirement already satisfied: multidict<7.0,>=4.5 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (6.1.0)\n",
|
||||
"Requirement already satisfied: propcache>=0.2.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (0.2.1)\n",
|
||||
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from aiohttp->litellm) (1.18.3)\n",
|
||||
"Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from tokenizers->litellm) (0.26.5)\n",
|
||||
"Requirement already satisfied: filelock in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (3.16.1)\n",
|
||||
"Requirement already satisfied: fsspec>=2023.5.0 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (2024.10.0)\n",
|
||||
"Requirement already satisfied: packaging>=20.9 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (24.2)\n",
|
||||
"Requirement already satisfied: pyyaml>=5.1 in /Users/ericxiao/Documents/arize/.venv/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers->litellm) (6.0.2)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!pip install litellm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "jHEu-TjZ29PJ"
|
||||
},
|
||||
"source": [
|
||||
"## Set Env Variables"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"id": "QWd9rTysxsWO"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import litellm\n",
|
||||
"import os\n",
|
||||
"from getpass import getpass\n",
|
||||
"\n",
|
||||
"os.environ[\"ARIZE_SPACE_KEY\"] = getpass(\"Enter your Arize space key: \")\n",
|
||||
"os.environ[\"ARIZE_API_KEY\"] = getpass(\"Enter your Arize API key: \")\n",
|
||||
"os.environ['OPENAI_API_KEY']= getpass(\"Enter your OpenAI API key: \")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's run a completion call and see the traces in Arize"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Hello! Nice to meet you, OpenAI. How can I assist you today?\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# set arize as a callback, litellm will send the data to arize\n",
|
||||
"litellm.callbacks = [\"arize\"]\n",
|
||||
" \n",
|
||||
"# openai call\n",
|
||||
"response = litellm.completion(\n",
|
||||
" model=\"gpt-3.5-turbo\",\n",
|
||||
" messages=[\n",
|
||||
" {\"role\": \"user\", \"content\": \"Hi 👋 - i'm openai\"}\n",
|
||||
" ]\n",
|
||||
")\n",
|
||||
"print(response.choices[0].message.content)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
|
@ -29,6 +29,8 @@ services:
|
|||
POSTGRES_DB: litellm
|
||||
POSTGRES_USER: llmproxy
|
||||
POSTGRES_PASSWORD: dbpassword9090
|
||||
ports:
|
||||
- "5432:5432"
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
|
||||
interval: 1s
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Local Debugging
|
||||
There's 2 ways to do local debugging - `litellm.set_verbose=True` and by passing in a custom function `completion(...logger_fn=<your_local_function>)`. Warning: Make sure to not use `set_verbose` in production. It logs API keys, which might end up in log files.
|
||||
There are two ways to do local debugging - `litellm._turn_on_debug()` and by passing in a custom function `completion(...logger_fn=<your_local_function>)`. Warning: make sure not to use `_turn_on_debug()` in production. It logs API keys, which might end up in log files.
|
||||
|
||||
## Set Verbose
|
||||
|
||||
|
@ -8,7 +8,7 @@ This is good for getting print statements for everything litellm is doing.
|
|||
import litellm
|
||||
from litellm import completion
|
||||
|
||||
litellm.set_verbose=True # 👈 this is the 1-line change you need to make
|
||||
litellm._turn_on_debug() # 👈 this is the 1-line change you need to make
|
||||
|
||||
## set ENV variables
|
||||
os.environ["OPENAI_API_KEY"] = "openai key"
|
||||
|
|
|
@ -28,7 +28,7 @@ import litellm
|
|||
import os
|
||||
|
||||
os.environ["ARIZE_SPACE_KEY"] = ""
|
||||
os.environ["ARIZE_API_KEY"] = "" # defaults to litellm-completion
|
||||
os.environ["ARIZE_API_KEY"] = ""
|
||||
|
||||
# LLM API Keys
|
||||
os.environ['OPENAI_API_KEY']=""
|
||||
|
|
|
@ -12,6 +12,9 @@ Supports **ALL** Assembly AI Endpoints
|
|||
|
||||
[**See All Assembly AI Endpoints**](https://www.assemblyai.com/docs/api-reference)
|
||||
|
||||
|
||||
<iframe width="840" height="500" src="https://www.loom.com/embed/aac3f4d74592448992254bfa79b9f62d?sid=267cd0ab-d92b-42fa-b97a-9f385ef8930c" frameborder="0" webkitallowfullscreen mozallowfullscreen allowfullscreen></iframe>
|
||||
|
||||
## Quick Start
|
||||
|
||||
Let's call the Assembly AI [`/v2/transcripts` endpoint](https://www.assemblyai.com/docs/api-reference/transcripts)
|
||||
|
@ -35,6 +38,8 @@ litellm
|
|||
Let's call the Assembly AI `/v2/transcripts` endpoint
|
||||
|
||||
```python
|
||||
import assemblyai as aai
|
||||
|
||||
LITELLM_VIRTUAL_KEY = "sk-1234" # <your-virtual-key>
|
||||
LITELLM_PROXY_BASE_URL = "http://0.0.0.0:4000/assemblyai" # <your-proxy-base-url>/assemblyai
|
||||
|
||||
|
@ -53,3 +58,28 @@ print(transcript)
|
|||
print(transcript.id)
|
||||
```
|
||||
|
||||
## Calling Assembly AI EU endpoints
|
||||
|
||||
If you want to send your request to the Assembly AI EU endpoint, you can do so by setting the `LITELLM_PROXY_BASE_URL` to `<your-proxy-base-url>/eu.assemblyai`
|
||||
|
||||
|
||||
```python
|
||||
import assemblyai as aai
|
||||
|
||||
LITELLM_VIRTUAL_KEY = "sk-1234" # <your-virtual-key>
|
||||
LITELLM_PROXY_BASE_URL = "http://0.0.0.0:4000/eu.assemblyai" # <your-proxy-base-url>/eu.assemblyai
|
||||
|
||||
aai.settings.api_key = f"Bearer {LITELLM_VIRTUAL_KEY}"
|
||||
aai.settings.base_url = LITELLM_PROXY_BASE_URL
|
||||
|
||||
# URL of the file to transcribe
|
||||
FILE_URL = "https://assembly.ai/wildfires.mp3"
|
||||
|
||||
# You can also transcribe a local file by passing in a file path
|
||||
# FILE_URL = './path/to/file.mp3'
|
||||
|
||||
transcriber = aai.Transcriber()
|
||||
transcript = transcriber.transcribe(FILE_URL)
|
||||
print(transcript)
|
||||
print(transcript.id)
|
||||
```
|
||||
|
|
|
@ -987,6 +987,106 @@ curl http://0.0.0.0:4000/v1/chat/completions \
|
|||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## [BETA] Citations API
|
||||
|
||||
Pass `citations: {"enabled": true}` to Anthropic, to get citations on your document responses.
|
||||
|
||||
Note: This interface is in BETA. If you have feedback on how citations should be returned, please [tell us here](https://github.com/BerriAI/litellm/issues/7970#issuecomment-2644437943)
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="sdk" label="SDK">
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
|
||||
resp = completion(
|
||||
model="claude-3-5-sonnet-20241022",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "document",
|
||||
"source": {
|
||||
"type": "text",
|
||||
"media_type": "text/plain",
|
||||
"data": "The grass is green. The sky is blue.",
|
||||
},
|
||||
"title": "My Document",
|
||||
"context": "This is a trustworthy document.",
|
||||
"citations": {"enabled": True},
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What color is the grass and sky?",
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
citations = resp.choices[0].message.provider_specific_fields["citations"]
|
||||
|
||||
assert citations is not None
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="proxy" label="PROXY">
|
||||
|
||||
1. Setup config.yaml
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: anthropic-claude
|
||||
litellm_params:
|
||||
model: anthropic/claude-3-5-sonnet-20241022
|
||||
api_key: os.environ/ANTHROPIC_API_KEY
|
||||
```
|
||||
|
||||
2. Start proxy
|
||||
|
||||
```bash
|
||||
litellm --config /path/to/config.yaml
|
||||
|
||||
# RUNNING on http://0.0.0.0:4000
|
||||
```
|
||||
|
||||
3. Test it!
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer sk-1234' \
|
||||
-d '{
|
||||
"model": "anthropic-claude",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "document",
|
||||
"source": {
|
||||
"type": "text",
|
||||
"media_type": "text/plain",
|
||||
"data": "The grass is green. The sky is blue.",
|
||||
},
|
||||
"title": "My Document",
|
||||
"context": "This is a trustworthy document.",
|
||||
"citations": {"enabled": True},
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What color is the grass and sky?",
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
## Usage - passing 'user_id' to Anthropic
|
||||
|
||||
LiteLLM translates the OpenAI `user` param to Anthropic's `metadata[user_id]` param.
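For illustration, a minimal SDK sketch of this mapping (assuming the same `claude-3-5-sonnet-20241022` model used above; the `user` value is a made-up identifier):

```python
from litellm import completion

# The OpenAI-style `user` param is forwarded to Anthropic as metadata[user_id]
resp = completion(
    model="claude-3-5-sonnet-20241022",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    user="end-user-1234",  # hypothetical end-user identifier
)
print(resp.choices[0].message.content)
```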
|
||||
|
|
|
@ -688,7 +688,9 @@ response = litellm.completion(
|
|||
|-----------------------|--------------------------------------------------------|--------------------------------|
|
||||
| gemini-pro | `completion(model='gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
|
||||
| gemini-1.5-pro-latest | `completion(model='gemini/gemini-1.5-pro-latest', messages)` | `os.environ['GEMINI_API_KEY']` |
|
||||
| gemini-pro-vision | `completion(model='gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |
|
||||
| gemini-2.0-flash | `completion(model='gemini/gemini-2.0-flash', messages)` | `os.environ['GEMINI_API_KEY']` |
|
||||
| gemini-2.0-flash-exp | `completion(model='gemini/gemini-2.0-flash-exp', messages)` | `os.environ['GEMINI_API_KEY']` |
|
||||
| gemini-2.0-flash-lite-preview-02-05 | `completion(model='gemini/gemini-2.0-flash-lite-preview-02-05', messages)` | `os.environ['GEMINI_API_KEY']` |
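
As a quick illustration, a minimal call to one of the newly listed models (a sketch; assumes `GEMINI_API_KEY` is exported as shown in the table):

```python
import os
from litellm import completion

os.environ["GEMINI_API_KEY"] = "..."  # placeholder key

response = completion(
    model="gemini/gemini-2.0-flash",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.choices[0].message.content)
```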
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ guardrails:
|
|||
- guardrail_name: aim-protected-app
|
||||
litellm_params:
|
||||
guardrail: aim
|
||||
mode: pre_call
|
||||
mode: pre_call # 'during_call' is also available
|
||||
api_key: os.environ/AIM_API_KEY
|
||||
api_base: os.environ/AIM_API_BASE # Optional, use only when using a self-hosted Aim Outpost
|
||||
```
|
||||
|
|
|
@ -166,7 +166,7 @@ response = client.chat.completions.create(
|
|||
{"role": "user", "content": "what color is red"}
|
||||
],
|
||||
logit_bias={12481: 100},
|
||||
timeout=1
|
||||
extra_body={"timeout": 1} # 👈 KEY CHANGE
|
||||
)
|
||||
|
||||
print(response)
|
||||
|
|
|
@ -163,10 +163,12 @@ scope: "litellm-proxy-admin ..."
|
|||
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
enable_jwt_auth: True
|
||||
litellm_jwtauth:
|
||||
user_id_jwt_field: "sub"
|
||||
team_ids_jwt_field: "groups"
|
||||
user_id_upsert: true # add user_id to the db if they don't exist
|
||||
enforce_team_based_model_access: true # don't allow users to access models unless the team has access
|
||||
```
|
||||
|
||||
This is assuming your token looks like this:
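
For example, a token with claims along these lines would satisfy the config above (illustrative values only; `sub` maps to `user_id_jwt_field` and `groups` to `team_ids_jwt_field`):

```
{
  "sub": "user-1234",
  "groups": ["team-eng", "team-ops"]
}
```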
|
||||
|
@ -370,4 +372,68 @@ Supported internal roles:
|
|||
- `internal_user`: User object will be used for RBAC spend tracking. Use this for tracking spend for an 'individual user'.
|
||||
- `proxy_admin`: Proxy admin will be used for RBAC spend tracking. Use this for granting admin access to a token.
|
||||
|
||||
### [Architecture Diagram (Control Model Access)](./jwt_auth_arch)
|
||||
### [Architecture Diagram (Control Model Access)](./jwt_auth_arch)
|
||||
|
||||
## [BETA] Control Model Access with Scopes
|
||||
|
||||
Control which models a JWT can access. Set `enforce_scope_based_access: true` to enforce scope-based access control.
|
||||
|
||||
### 1. Setup config.yaml with scope mappings.
|
||||
|
||||
|
||||
```yaml
|
||||
model_list:
|
||||
- model_name: anthropic-claude
|
||||
litellm_params:
|
||||
model: anthropic/claude-3-5-sonnet
|
||||
api_key: os.environ/ANTHROPIC_API_KEY
|
||||
- model_name: gpt-3.5-turbo-testing
|
||||
litellm_params:
|
||||
model: gpt-3.5-turbo
|
||||
api_key: os.environ/OPENAI_API_KEY
|
||||
|
||||
general_settings:
|
||||
enable_jwt_auth: True
|
||||
litellm_jwtauth:
|
||||
team_id_jwt_field: "client_id" # 👈 set the field in the JWT token that contains the team id
|
||||
team_id_upsert: true # 👈 upsert the team to db, if team id is not found in db
|
||||
scope_mappings:
|
||||
- scope: litellm.api.consumer
|
||||
models: ["anthropic-claude"]
|
||||
- scope: litellm.api.gpt_3_5_turbo
|
||||
models: ["gpt-3.5-turbo-testing"]
|
||||
enforce_scope_based_access: true # 👈 enforce scope-based access control
|
||||
enforce_rbac: true # 👈 enforces only a Team/User/ProxyAdmin can access the proxy.
|
||||
```
|
||||
|
||||
#### Scope Mapping Spec
|
||||
|
||||
- `scope`: The scope to be used for the JWT token.
|
||||
- `models`: The models that the JWT token can access. Value is the `model_name` in `model_list`. Note: Wildcard routes are not currently supported.
|
||||
|
||||
### 2. Create a JWT with the correct scopes.
|
||||
|
||||
Expected Token:
|
||||
|
||||
```
|
||||
{
|
||||
"scope": ["litellm.api.consumer", "litellm.api.gpt_3_5_turbo"]
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Test the flow.
|
||||
|
||||
```bash
|
||||
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: Bearer eyJhbGci...' \
|
||||
-d '{
|
||||
"model": "gpt-3.5-turbo-testing",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hey, how'\''s it going 1234?"
|
||||
}
|
||||
]
|
||||
}'
|
||||
```
|
|
@ -360,7 +360,7 @@ BEDROCK_CONVERSE_MODELS = [
|
|||
"meta.llama3-2-90b-instruct-v1:0",
|
||||
]
|
||||
BEDROCK_INVOKE_PROVIDERS_LITERAL = Literal[
|
||||
"cohere", "anthropic", "mistral", "amazon", "meta", "llama", "ai21"
|
||||
"cohere", "anthropic", "mistral", "amazon", "meta", "llama", "ai21", "nova"
|
||||
]
|
||||
####### COMPLETION MODELS ###################
|
||||
open_ai_chat_completion_models: List = []
|
||||
|
@ -863,6 +863,9 @@ from .llms.bedrock.common_utils import (
|
|||
from .llms.bedrock.chat.invoke_transformations.amazon_ai21_transformation import (
|
||||
AmazonAI21Config,
|
||||
)
|
||||
from .llms.bedrock.chat.invoke_transformations.amazon_nova_transformation import (
|
||||
AmazonInvokeNovaConfig,
|
||||
)
|
||||
from .llms.bedrock.chat.invoke_transformations.anthropic_claude2_transformation import (
|
||||
AmazonAnthropicConfig,
|
||||
)
|
||||
|
|
36 litellm/integrations/additional_logging_utils.py Normal file
|
@ -0,0 +1,36 @@
|
|||
"""
|
||||
Base class for Additional Logging Utils for CustomLoggers
|
||||
|
||||
- Health Check for the logging util
|
||||
- Get Request / Response Payload for the logging util
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from litellm.types.integrations.base_health_check import IntegrationHealthCheckStatus
|
||||
|
||||
|
||||
class AdditionalLoggingUtils(ABC):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@abstractmethod
|
||||
async def async_health_check(self) -> IntegrationHealthCheckStatus:
|
||||
"""
|
||||
Check if the service is healthy
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def get_request_response_payload(
|
||||
self,
|
||||
request_id: str,
|
||||
start_time_utc: Optional[datetime],
|
||||
end_time_utc: Optional[datetime],
|
||||
) -> Optional[dict]:
|
||||
"""
|
||||
Get the request and response payload for a given `request_id`
|
||||
"""
|
||||
return None
|
|
@ -1,19 +0,0 @@
|
|||
"""
|
||||
Base class for health check integrations
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from litellm.types.integrations.base_health_check import IntegrationHealthCheckStatus
|
||||
|
||||
|
||||
class HealthCheckIntegration(ABC):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@abstractmethod
|
||||
async def async_health_check(self) -> IntegrationHealthCheckStatus:
|
||||
"""
|
||||
Check if the service is healthy
|
||||
"""
|
||||
pass
|
|
@ -38,14 +38,14 @@ from litellm.types.integrations.datadog import *
|
|||
from litellm.types.services import ServiceLoggerPayload
|
||||
from litellm.types.utils import StandardLoggingPayload
|
||||
|
||||
from ..base_health_check import HealthCheckIntegration
|
||||
from ..additional_logging_utils import AdditionalLoggingUtils
|
||||
|
||||
DD_MAX_BATCH_SIZE = 1000 # max number of logs DD API can accept
|
||||
|
||||
|
||||
class DataDogLogger(
|
||||
CustomBatchLogger,
|
||||
HealthCheckIntegration,
|
||||
AdditionalLoggingUtils,
|
||||
):
|
||||
# Class variables or attributes
|
||||
def __init__(
|
||||
|
@ -543,3 +543,13 @@ class DataDogLogger(
|
|||
status="unhealthy",
|
||||
error_message=str(e),
|
||||
)
|
||||
|
||||
async def get_request_response_payload(
|
||||
self,
|
||||
request_id: str,
|
||||
start_time_utc: Optional[datetimeObj],
|
||||
end_time_utc: Optional[datetimeObj],
|
||||
) -> Optional[dict]:
|
||||
raise NotImplementedError(
|
||||
"Datdog Integration for getting request/response payloads not implemented as yet"
|
||||
)
|
||||
|
|
|
@ -1,12 +1,16 @@
|
|||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
||||
from urllib.parse import quote
|
||||
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.integrations.additional_logging_utils import AdditionalLoggingUtils
|
||||
from litellm.integrations.gcs_bucket.gcs_bucket_base import GCSBucketBase
|
||||
from litellm.proxy._types import CommonProxyErrors
|
||||
from litellm.types.integrations.base_health_check import IntegrationHealthCheckStatus
|
||||
from litellm.types.integrations.gcs_bucket import *
|
||||
from litellm.types.utils import StandardLoggingPayload
|
||||
|
||||
|
@ -20,7 +24,7 @@ GCS_DEFAULT_BATCH_SIZE = 2048
|
|||
GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20
|
||||
|
||||
|
||||
class GCSBucketLogger(GCSBucketBase):
|
||||
class GCSBucketLogger(GCSBucketBase, AdditionalLoggingUtils):
|
||||
def __init__(self, bucket_name: Optional[str] = None) -> None:
|
||||
from litellm.proxy.proxy_server import premium_user
|
||||
|
||||
|
@ -39,6 +43,7 @@ class GCSBucketLogger(GCSBucketBase):
|
|||
batch_size=self.batch_size,
|
||||
flush_interval=self.flush_interval,
|
||||
)
|
||||
AdditionalLoggingUtils.__init__(self)
|
||||
|
||||
if premium_user is not True:
|
||||
raise ValueError(
|
||||
|
@ -150,11 +155,16 @@ class GCSBucketLogger(GCSBucketBase):
|
|||
"""
|
||||
Get the object name to use for the current payload
|
||||
"""
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
current_date = self._get_object_date_from_datetime(datetime.now(timezone.utc))
|
||||
if logging_payload.get("error_str", None) is not None:
|
||||
object_name = f"{current_date}/failure-{uuid.uuid4().hex}"
|
||||
object_name = self._generate_failure_object_name(
|
||||
request_date_str=current_date,
|
||||
)
|
||||
else:
|
||||
object_name = f"{current_date}/{response_obj.get('id', '')}"
|
||||
object_name = self._generate_success_object_name(
|
||||
request_date_str=current_date,
|
||||
response_id=response_obj.get("id", ""),
|
||||
)
|
||||
|
||||
# used for testing
|
||||
_litellm_params = kwargs.get("litellm_params", None) or {}
|
||||
|
@ -163,3 +173,65 @@ class GCSBucketLogger(GCSBucketBase):
|
|||
object_name = _metadata["gcs_log_id"]
|
||||
|
||||
return object_name
|
||||
|
||||
async def get_request_response_payload(
|
||||
self,
|
||||
request_id: str,
|
||||
start_time_utc: Optional[datetime],
|
||||
end_time_utc: Optional[datetime],
|
||||
) -> Optional[dict]:
|
||||
"""
|
||||
Get the request and response payload for a given `request_id`
|
||||
Tries current day, next day, and previous day until it finds the payload
|
||||
"""
|
||||
if start_time_utc is None:
|
||||
raise ValueError(
|
||||
"start_time_utc is required for getting a payload from GCS Bucket"
|
||||
)
|
||||
|
||||
# Try current day, next day, and previous day
|
||||
dates_to_try = [
|
||||
start_time_utc,
|
||||
start_time_utc + timedelta(days=1),
|
||||
start_time_utc - timedelta(days=1),
|
||||
]
|
||||
date_str = None
|
||||
for date in dates_to_try:
|
||||
try:
|
||||
date_str = self._get_object_date_from_datetime(datetime_obj=date)
|
||||
object_name = self._generate_success_object_name(
|
||||
request_date_str=date_str,
|
||||
response_id=request_id,
|
||||
)
|
||||
encoded_object_name = quote(object_name, safe="")
|
||||
response = await self.download_gcs_object(encoded_object_name)
|
||||
|
||||
if response is not None:
|
||||
loaded_response = json.loads(response)
|
||||
return loaded_response
|
||||
except Exception as e:
|
||||
verbose_logger.debug(
|
||||
f"Failed to fetch payload for date {date_str}: {str(e)}"
|
||||
)
|
||||
continue
|
||||
|
||||
return None
|
||||
|
||||
def _generate_success_object_name(
|
||||
self,
|
||||
request_date_str: str,
|
||||
response_id: str,
|
||||
) -> str:
|
||||
return f"{request_date_str}/{response_id}"
|
||||
|
||||
def _generate_failure_object_name(
|
||||
self,
|
||||
request_date_str: str,
|
||||
) -> str:
|
||||
return f"{request_date_str}/failure-{uuid.uuid4().hex}"
|
||||
|
||||
def _get_object_date_from_datetime(self, datetime_obj: datetime) -> str:
|
||||
return datetime_obj.strftime("%Y-%m-%d")
|
||||
|
||||
async def async_health_check(self) -> IntegrationHealthCheckStatus:
|
||||
raise NotImplementedError("GCS Bucket does not support health check")
|
||||
|
|
|
@ -3,7 +3,8 @@
|
|||
import copy
|
||||
import os
|
||||
import traceback
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, cast
|
||||
|
||||
from packaging.version import Version
|
||||
|
||||
|
@ -13,9 +14,16 @@ from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info
|
|||
from litellm.llms.custom_httpx.http_handler import _get_httpx_client
|
||||
from litellm.secret_managers.main import str_to_bool
|
||||
from litellm.types.integrations.langfuse import *
|
||||
from litellm.types.llms.openai import HttpxBinaryResponseContent
|
||||
from litellm.types.utils import (
|
||||
EmbeddingResponse,
|
||||
ImageResponse,
|
||||
ModelResponse,
|
||||
RerankResponse,
|
||||
StandardLoggingPayload,
|
||||
StandardLoggingPromptManagementMetadata,
|
||||
TextCompletionResponse,
|
||||
TranscriptionResponse,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
|
@ -150,19 +158,29 @@ class LangFuseLogger:
|
|||
|
||||
return metadata
|
||||
|
||||
def _old_log_event( # noqa: PLR0915
|
||||
def log_event_on_langfuse(
|
||||
self,
|
||||
kwargs,
|
||||
response_obj,
|
||||
start_time,
|
||||
end_time,
|
||||
user_id,
|
||||
print_verbose,
|
||||
level="DEFAULT",
|
||||
status_message=None,
|
||||
kwargs: dict,
|
||||
response_obj: Union[
|
||||
None,
|
||||
dict,
|
||||
EmbeddingResponse,
|
||||
ModelResponse,
|
||||
TextCompletionResponse,
|
||||
ImageResponse,
|
||||
TranscriptionResponse,
|
||||
RerankResponse,
|
||||
HttpxBinaryResponseContent,
|
||||
],
|
||||
start_time: Optional[datetime] = None,
|
||||
end_time: Optional[datetime] = None,
|
||||
user_id: Optional[str] = None,
|
||||
level: str = "DEFAULT",
|
||||
status_message: Optional[str] = None,
|
||||
) -> dict:
|
||||
# Method definition
|
||||
|
||||
"""
|
||||
Logs a success or error event on Langfuse
|
||||
"""
|
||||
try:
|
||||
verbose_logger.debug(
|
||||
f"Langfuse Logging - Enters logging function for model {kwargs}"
|
||||
|
@ -198,66 +216,13 @@ class LangFuseLogger:
|
|||
# if casting value to str fails don't block logging
|
||||
pass
|
||||
|
||||
# end of processing langfuse ########################
|
||||
if (
|
||||
level == "ERROR"
|
||||
and status_message is not None
|
||||
and isinstance(status_message, str)
|
||||
):
|
||||
input = prompt
|
||||
output = status_message
|
||||
elif response_obj is not None and (
|
||||
kwargs.get("call_type", None) == "embedding"
|
||||
or isinstance(response_obj, litellm.EmbeddingResponse)
|
||||
):
|
||||
input = prompt
|
||||
output = None
|
||||
elif response_obj is not None and isinstance(
|
||||
response_obj, litellm.ModelResponse
|
||||
):
|
||||
input = prompt
|
||||
output = response_obj["choices"][0]["message"].json()
|
||||
elif response_obj is not None and isinstance(
|
||||
response_obj, litellm.HttpxBinaryResponseContent
|
||||
):
|
||||
input = prompt
|
||||
output = "speech-output"
|
||||
elif response_obj is not None and isinstance(
|
||||
response_obj, litellm.TextCompletionResponse
|
||||
):
|
||||
input = prompt
|
||||
output = response_obj.choices[0].text
|
||||
elif response_obj is not None and isinstance(
|
||||
response_obj, litellm.ImageResponse
|
||||
):
|
||||
input = prompt
|
||||
output = response_obj["data"]
|
||||
elif response_obj is not None and isinstance(
|
||||
response_obj, litellm.TranscriptionResponse
|
||||
):
|
||||
input = prompt
|
||||
output = response_obj["text"]
|
||||
elif response_obj is not None and isinstance(
|
||||
response_obj, litellm.RerankResponse
|
||||
):
|
||||
input = prompt
|
||||
output = response_obj.results
|
||||
elif (
|
||||
kwargs.get("call_type") is not None
|
||||
and kwargs.get("call_type") == "_arealtime"
|
||||
and response_obj is not None
|
||||
and isinstance(response_obj, list)
|
||||
):
|
||||
input = kwargs.get("input")
|
||||
output = response_obj
|
||||
elif (
|
||||
kwargs.get("call_type") is not None
|
||||
and kwargs.get("call_type") == "pass_through_endpoint"
|
||||
and response_obj is not None
|
||||
and isinstance(response_obj, dict)
|
||||
):
|
||||
input = prompt
|
||||
output = response_obj.get("response", "")
|
||||
input, output = self._get_langfuse_input_output_content(
|
||||
kwargs=kwargs,
|
||||
response_obj=response_obj,
|
||||
prompt=prompt,
|
||||
level=level,
|
||||
status_message=status_message,
|
||||
)
|
||||
verbose_logger.debug(
|
||||
f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}"
|
||||
)
|
||||
|
@ -265,31 +230,30 @@ class LangFuseLogger:
|
|||
generation_id = None
|
||||
if self._is_langfuse_v2():
|
||||
trace_id, generation_id = self._log_langfuse_v2(
|
||||
user_id,
|
||||
metadata,
|
||||
litellm_params,
|
||||
output,
|
||||
start_time,
|
||||
end_time,
|
||||
kwargs,
|
||||
optional_params,
|
||||
input,
|
||||
response_obj,
|
||||
level,
|
||||
print_verbose,
|
||||
litellm_call_id,
|
||||
user_id=user_id,
|
||||
metadata=metadata,
|
||||
litellm_params=litellm_params,
|
||||
output=output,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
kwargs=kwargs,
|
||||
optional_params=optional_params,
|
||||
input=input,
|
||||
response_obj=response_obj,
|
||||
level=level,
|
||||
litellm_call_id=litellm_call_id,
|
||||
)
|
||||
elif response_obj is not None:
|
||||
self._log_langfuse_v1(
|
||||
user_id,
|
||||
metadata,
|
||||
output,
|
||||
start_time,
|
||||
end_time,
|
||||
kwargs,
|
||||
optional_params,
|
||||
input,
|
||||
response_obj,
|
||||
user_id=user_id,
|
||||
metadata=metadata,
|
||||
output=output,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
kwargs=kwargs,
|
||||
optional_params=optional_params,
|
||||
input=input,
|
||||
response_obj=response_obj,
|
||||
)
|
||||
verbose_logger.debug(
|
||||
f"Langfuse Layer Logging - final response object: {response_obj}"
|
||||
|
@ -303,11 +267,108 @@ class LangFuseLogger:
|
|||
)
|
||||
return {"trace_id": None, "generation_id": None}
|
||||
|
||||
def _get_langfuse_input_output_content(
|
||||
self,
|
||||
kwargs: dict,
|
||||
response_obj: Union[
|
||||
None,
|
||||
dict,
|
||||
EmbeddingResponse,
|
||||
ModelResponse,
|
||||
TextCompletionResponse,
|
||||
ImageResponse,
|
||||
TranscriptionResponse,
|
||||
RerankResponse,
|
||||
HttpxBinaryResponseContent,
|
||||
],
|
||||
prompt: dict,
|
||||
level: str,
|
||||
status_message: Optional[str],
|
||||
) -> Tuple[Optional[dict], Optional[Union[str, dict, list]]]:
|
||||
"""
|
||||
Get the input and output content for Langfuse logging
|
||||
|
||||
Args:
|
||||
kwargs: The keyword arguments passed to the function
|
||||
response_obj: The response object returned by the function
|
||||
prompt: The prompt used to generate the response
|
||||
level: The level of the log message
|
||||
status_message: The status message of the log message
|
||||
|
||||
Returns:
|
||||
input: The input content for Langfuse logging
|
||||
output: The output content for Langfuse logging
|
||||
"""
|
||||
input = None
|
||||
output: Optional[Union[str, dict, List[Any]]] = None
|
||||
if (
|
||||
level == "ERROR"
|
||||
and status_message is not None
|
||||
and isinstance(status_message, str)
|
||||
):
|
||||
input = prompt
|
||||
output = status_message
|
||||
elif response_obj is not None and (
|
||||
kwargs.get("call_type", None) == "embedding"
|
||||
or isinstance(response_obj, litellm.EmbeddingResponse)
|
||||
):
|
||||
input = prompt
|
||||
output = None
|
||||
elif response_obj is not None and isinstance(
|
||||
response_obj, litellm.ModelResponse
|
||||
):
|
||||
input = prompt
|
||||
output = self._get_chat_content_for_langfuse(response_obj)
|
||||
elif response_obj is not None and isinstance(
|
||||
response_obj, litellm.HttpxBinaryResponseContent
|
||||
):
|
||||
input = prompt
|
||||
output = "speech-output"
|
||||
elif response_obj is not None and isinstance(
|
||||
response_obj, litellm.TextCompletionResponse
|
||||
):
|
||||
input = prompt
|
||||
output = self._get_text_completion_content_for_langfuse(response_obj)
|
||||
elif response_obj is not None and isinstance(
|
||||
response_obj, litellm.ImageResponse
|
||||
):
|
||||
input = prompt
|
||||
output = response_obj.get("data", None)
|
||||
elif response_obj is not None and isinstance(
|
||||
response_obj, litellm.TranscriptionResponse
|
||||
):
|
||||
input = prompt
|
||||
output = response_obj.get("text", None)
|
||||
elif response_obj is not None and isinstance(
|
||||
response_obj, litellm.RerankResponse
|
||||
):
|
||||
input = prompt
|
||||
output = response_obj.results
|
||||
elif (
|
||||
kwargs.get("call_type") is not None
|
||||
and kwargs.get("call_type") == "_arealtime"
|
||||
and response_obj is not None
|
||||
and isinstance(response_obj, list)
|
||||
):
|
||||
input = kwargs.get("input")
|
||||
output = response_obj
|
||||
elif (
|
||||
kwargs.get("call_type") is not None
|
||||
and kwargs.get("call_type") == "pass_through_endpoint"
|
||||
and response_obj is not None
|
||||
and isinstance(response_obj, dict)
|
||||
):
|
||||
input = prompt
|
||||
output = response_obj.get("response", "")
|
||||
return input, output
|
||||
|
||||
async def _async_log_event(
|
||||
self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
|
||||
self, kwargs, response_obj, start_time, end_time, user_id
|
||||
):
|
||||
"""
|
||||
TODO: support async calls when langfuse is truly async
|
||||
Langfuse SDK uses a background thread to log events
|
||||
|
||||
This approach does not impact latency and runs in the background
|
||||
"""
|
||||
|
||||
def _is_langfuse_v2(self):
|
||||
|
@ -361,19 +422,18 @@ class LangFuseLogger:
|
|||
|
||||
def _log_langfuse_v2( # noqa: PLR0915
|
||||
self,
|
||||
user_id,
|
||||
metadata,
|
||||
litellm_params,
|
||||
output,
|
||||
start_time,
|
||||
end_time,
|
||||
kwargs,
|
||||
optional_params,
|
||||
input,
|
||||
user_id: Optional[str],
|
||||
metadata: dict,
|
||||
litellm_params: dict,
|
||||
output: Optional[Union[str, dict, list]],
|
||||
start_time: Optional[datetime],
|
||||
end_time: Optional[datetime],
|
||||
kwargs: dict,
|
||||
optional_params: dict,
|
||||
input: Optional[dict],
|
||||
response_obj,
|
||||
level,
|
||||
print_verbose,
|
||||
litellm_call_id,
|
||||
level: str,
|
||||
litellm_call_id: Optional[str],
|
||||
) -> tuple:
|
||||
verbose_logger.debug("Langfuse Layer Logging - logging to langfuse v2")
|
||||
|
||||
|
@ -657,6 +717,31 @@ class LangFuseLogger:
|
|||
verbose_logger.error(f"Langfuse Layer Error - {traceback.format_exc()}")
|
||||
return None, None
|
||||
|
||||
@staticmethod
|
||||
def _get_chat_content_for_langfuse(
|
||||
response_obj: ModelResponse,
|
||||
):
|
||||
"""
|
||||
Get the chat content for Langfuse logging
|
||||
"""
|
||||
if response_obj.choices and len(response_obj.choices) > 0:
|
||||
output = response_obj["choices"][0]["message"].json()
|
||||
return output
|
||||
else:
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _get_text_completion_content_for_langfuse(
|
||||
response_obj: TextCompletionResponse,
|
||||
):
|
||||
"""
|
||||
Get the text completion content for Langfuse logging
|
||||
"""
|
||||
if response_obj.choices and len(response_obj.choices) > 0:
|
||||
return response_obj.choices[0].text
|
||||
else:
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _get_langfuse_tags(
|
||||
standard_logging_object: Optional[StandardLoggingPayload],
|
||||
|
|
|
@ -247,13 +247,12 @@ class LangfusePromptManagement(LangFuseLogger, PromptManagementBase, CustomLogge
|
|||
standard_callback_dynamic_params=standard_callback_dynamic_params,
|
||||
in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache,
|
||||
)
|
||||
langfuse_logger_to_use._old_log_event(
|
||||
langfuse_logger_to_use.log_event_on_langfuse(
|
||||
kwargs=kwargs,
|
||||
response_obj=response_obj,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
user_id=kwargs.get("user", None),
|
||||
print_verbose=None,
|
||||
)
|
||||
|
||||
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
|
@ -271,12 +270,11 @@ class LangfusePromptManagement(LangFuseLogger, PromptManagementBase, CustomLogge
|
|||
)
|
||||
if standard_logging_object is None:
|
||||
return
|
||||
langfuse_logger_to_use._old_log_event(
|
||||
langfuse_logger_to_use.log_event_on_langfuse(
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
response_obj=None,
|
||||
user_id=kwargs.get("user", None),
|
||||
print_verbose=None,
|
||||
status_message=standard_logging_object["error_str"],
|
||||
level="ERROR",
|
||||
kwargs=kwargs,
|
||||
|
|
|
@ -118,6 +118,7 @@ class PagerDutyAlerting(SlackAlerting):
|
|||
user_api_key_user_id=_meta.get("user_api_key_user_id"),
|
||||
user_api_key_team_alias=_meta.get("user_api_key_team_alias"),
|
||||
user_api_key_end_user_id=_meta.get("user_api_key_end_user_id"),
|
||||
user_api_key_user_email=_meta.get("user_api_key_user_email"),
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -195,6 +196,7 @@ class PagerDutyAlerting(SlackAlerting):
|
|||
user_api_key_user_id=user_api_key_dict.user_id,
|
||||
user_api_key_team_alias=user_api_key_dict.team_alias,
|
||||
user_api_key_end_user_id=user_api_key_dict.end_user_id,
|
||||
user_api_key_user_email=user_api_key_dict.user_email,
|
||||
)
|
||||
)
|
||||
|
||||
|
|
|
@ -423,6 +423,7 @@ class PrometheusLogger(CustomLogger):
|
|||
team=user_api_team,
|
||||
team_alias=user_api_team_alias,
|
||||
user=user_id,
|
||||
user_email=standard_logging_payload["metadata"]["user_api_key_user_email"],
|
||||
status_code="200",
|
||||
model=model,
|
||||
litellm_model_name=model,
|
||||
|
@ -806,6 +807,7 @@ class PrometheusLogger(CustomLogger):
|
|||
enum_values = UserAPIKeyLabelValues(
|
||||
end_user=user_api_key_dict.end_user_id,
|
||||
user=user_api_key_dict.user_id,
|
||||
user_email=user_api_key_dict.user_email,
|
||||
hashed_api_key=user_api_key_dict.api_key,
|
||||
api_key_alias=user_api_key_dict.key_alias,
|
||||
team=user_api_key_dict.team_id,
|
||||
|
@ -853,6 +855,7 @@ class PrometheusLogger(CustomLogger):
|
|||
team=user_api_key_dict.team_id,
|
||||
team_alias=user_api_key_dict.team_alias,
|
||||
user=user_api_key_dict.user_id,
|
||||
user_email=user_api_key_dict.user_email,
|
||||
status_code="200",
|
||||
)
|
||||
_labels = prometheus_label_factory(
|
||||
|
|
|
@ -199,6 +199,7 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
dynamic_async_failure_callbacks: Optional[
|
||||
List[Union[str, Callable, CustomLogger]]
|
||||
] = None,
|
||||
applied_guardrails: Optional[List[str]] = None,
|
||||
kwargs: Optional[Dict] = None,
|
||||
):
|
||||
_input: Optional[str] = messages # save original value of messages
|
||||
|
@ -271,6 +272,7 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
"litellm_call_id": litellm_call_id,
|
||||
"input": _input,
|
||||
"litellm_params": litellm_params,
|
||||
"applied_guardrails": applied_guardrails,
|
||||
}
|
||||
|
||||
def process_dynamic_callbacks(self):
|
||||
|
@ -1247,13 +1249,12 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache,
|
||||
)
|
||||
if langfuse_logger_to_use is not None:
|
||||
_response = langfuse_logger_to_use._old_log_event(
|
||||
_response = langfuse_logger_to_use.log_event_on_langfuse(
|
||||
kwargs=kwargs,
|
||||
response_obj=result,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
user_id=kwargs.get("user", None),
|
||||
print_verbose=print_verbose,
|
||||
)
|
||||
if _response is not None and isinstance(_response, dict):
|
||||
_trace_id = _response.get("trace_id", None)
|
||||
|
@ -1957,12 +1958,11 @@ class Logging(LiteLLMLoggingBaseClass):
|
|||
standard_callback_dynamic_params=self.standard_callback_dynamic_params,
|
||||
in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache,
|
||||
)
|
||||
_response = langfuse_logger_to_use._old_log_event(
|
||||
_response = langfuse_logger_to_use.log_event_on_langfuse(
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
response_obj=None,
|
||||
user_id=kwargs.get("user", None),
|
||||
print_verbose=print_verbose,
|
||||
status_message=str(exception),
|
||||
level="ERROR",
|
||||
kwargs=self.model_call_details,
|
||||
|
@ -2854,6 +2854,7 @@ class StandardLoggingPayloadSetup:
|
|||
metadata: Optional[Dict[str, Any]],
|
||||
litellm_params: Optional[dict] = None,
|
||||
prompt_integration: Optional[str] = None,
|
||||
applied_guardrails: Optional[List[str]] = None,
|
||||
) -> StandardLoggingMetadata:
|
||||
"""
|
||||
Clean and filter the metadata dictionary to include only the specified keys in StandardLoggingMetadata.
|
||||
|
@ -2868,6 +2869,7 @@ class StandardLoggingPayloadSetup:
|
|||
- If the input metadata is None or not a dictionary, an empty StandardLoggingMetadata object is returned.
|
||||
- If 'user_api_key' is present in metadata and is a valid SHA256 hash, it's stored as 'user_api_key_hash'.
|
||||
"""
|
||||
|
||||
prompt_management_metadata: Optional[
|
||||
StandardLoggingPromptManagementMetadata
|
||||
] = None
|
||||
|
@ -2892,11 +2894,13 @@ class StandardLoggingPayloadSetup:
|
|||
user_api_key_org_id=None,
|
||||
user_api_key_user_id=None,
|
||||
user_api_key_team_alias=None,
|
||||
user_api_key_user_email=None,
|
||||
spend_logs_metadata=None,
|
||||
requester_ip_address=None,
|
||||
requester_metadata=None,
|
||||
user_api_key_end_user_id=None,
|
||||
prompt_management_metadata=prompt_management_metadata,
|
||||
applied_guardrails=applied_guardrails,
|
||||
)
|
||||
if isinstance(metadata, dict):
|
||||
# Filter the metadata dictionary to include only the specified keys
|
||||
|
@ -3195,6 +3199,7 @@ def get_standard_logging_object_payload(
|
|||
metadata=metadata,
|
||||
litellm_params=litellm_params,
|
||||
prompt_integration=kwargs.get("prompt_integration", None),
|
||||
applied_guardrails=kwargs.get("applied_guardrails", None),
|
||||
)
|
||||
|
||||
_request_body = proxy_server_request.get("body", {})
|
||||
|
@ -3324,12 +3329,14 @@ def get_standard_logging_metadata(
|
|||
user_api_key_team_id=None,
|
||||
user_api_key_org_id=None,
|
||||
user_api_key_user_id=None,
|
||||
user_api_key_user_email=None,
|
||||
user_api_key_team_alias=None,
|
||||
spend_logs_metadata=None,
|
||||
requester_ip_address=None,
|
||||
requester_metadata=None,
|
||||
user_api_key_end_user_id=None,
|
||||
prompt_management_metadata=None,
|
||||
applied_guardrails=None,
|
||||
)
|
||||
if isinstance(metadata, dict):
|
||||
# Filter the metadata dictionary to include only the specified keys
|
||||
|
|
|
@ -3,7 +3,8 @@ import json
|
|||
import time
|
||||
import traceback
|
||||
import uuid
|
||||
from typing import Dict, Iterable, List, Literal, Optional, Union
|
||||
import re
|
||||
from typing import Dict, Iterable, List, Literal, Optional, Union, Tuple
|
||||
|
||||
import litellm
|
||||
from litellm._logging import verbose_logger
|
||||
|
@ -220,6 +221,16 @@ def _handle_invalid_parallel_tool_calls(
        # if there is a JSONDecodeError, return the original tool_calls
        return tool_calls


def _parse_content_for_reasoning(message_text: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    if not message_text:
        return None, None

    reasoning_match = re.match(r"<think>(.*?)</think>(.*)", message_text, re.DOTALL)

    if reasoning_match:
        return reasoning_match.group(1), reasoning_match.group(2)

    return None, message_text


class LiteLLMResponseObjectHandler:
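
Note: a minimal standalone sketch of the reasoning-tag split this hunk introduces, re-implemented outside litellm so it can be run directly; the sample strings and the function name are illustrative assumptions, not part of the diff.

import re
from typing import Optional, Tuple

def parse_reasoning(message_text: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    # Mirrors _parse_content_for_reasoning: returns (reasoning, remaining content).
    if not message_text:
        return None, None
    m = re.match(r"<think>(.*?)</think>(.*)", message_text, re.DOTALL)
    if m:
        return m.group(1), m.group(2)
    return None, message_text

print(parse_reasoning("<think>step 1, step 2</think>The answer is 4."))
# ('step 1, step 2', 'The answer is 4.')
print(parse_reasoning("No tags here."))
# (None, 'No tags here.')
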
@ -432,8 +443,14 @@ def convert_to_model_response_object( # noqa: PLR0915
|
|||
for field in choice["message"].keys():
|
||||
if field not in message_keys:
|
||||
provider_specific_fields[field] = choice["message"][field]
|
||||
|
||||
# Handle reasoning models that display `reasoning_content` within `content`
|
||||
reasoning_content, content = _parse_content_for_reasoning(choice["message"].get("content", None))
|
||||
if reasoning_content:
|
||||
provider_specific_fields["reasoning_content"] = reasoning_content
|
||||
|
||||
message = Message(
|
||||
content=choice["message"].get("content", None),
|
||||
content=content,
|
||||
role=choice["message"]["role"] or "assistant",
|
||||
function_call=choice["message"].get("function_call", None),
|
||||
tool_calls=tool_calls,
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
from typing import Callable, List, Union
|
||||
from typing import Callable, List, Set, Union
|
||||
|
||||
import litellm
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.integrations.additional_logging_utils import AdditionalLoggingUtils
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
|
||||
|
||||
|
@ -85,6 +86,21 @@ class LoggingCallbackManager:
            callback=callback, parent_list=litellm._async_failure_callback
        )

    def remove_callback_from_list_by_object(
        self, callback_list, obj
    ):
        """
        Remove callbacks that are methods of a particular object (e.g., router cleanup)
        """
        if not isinstance(callback_list, list):  # not a list -> do nothing
            return

        remove_list = [
            c for c in callback_list if hasattr(c, "__self__") and c.__self__ == obj
        ]

        for c in remove_list:
            callback_list.remove(c)

    def _add_string_callback_to_list(
        self, callback: str, parent_list: List[Union[CustomLogger, Callable, str]]
    ):
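
Note: a small runnable sketch of the bound-method cleanup pattern this hunk adds; the FakeRouter class and the callback list below are stand-ins invented for illustration, not litellm types.

class FakeRouter:
    def on_success(self, *args, **kwargs):
        pass

router = FakeRouter()
other = FakeRouter()
callbacks = [router.on_success, other.on_success, print]

# Drop only the callbacks bound to `router`, using the same __self__ check as the manager.
remove_list = [c for c in callbacks if hasattr(c, "__self__") and c.__self__ is router]
for c in remove_list:
    callbacks.remove(c)

assert callbacks == [other.on_success, print]
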
@ -205,3 +221,36 @@ class LoggingCallbackManager:
        litellm._async_success_callback = []
        litellm._async_failure_callback = []
        litellm.callbacks = []

    def _get_all_callbacks(self) -> List[Union[CustomLogger, Callable, str]]:
        """
        Get all callbacks from litellm.callbacks, litellm.success_callback, litellm.failure_callback, litellm._async_success_callback, litellm._async_failure_callback
        """
        return (
            litellm.callbacks
            + litellm.success_callback
            + litellm.failure_callback
            + litellm._async_success_callback
            + litellm._async_failure_callback
        )

    def get_active_additional_logging_utils_from_custom_logger(
        self,
    ) -> Set[AdditionalLoggingUtils]:
        """
        Get all custom loggers that are instances of the given class type

        Args:
            class_type: The class type to match against (e.g., AdditionalLoggingUtils)

        Returns:
            Set[CustomLogger]: Set of custom loggers that are instances of the given class type
        """
        all_callbacks = self._get_all_callbacks()
        matched_callbacks: Set[AdditionalLoggingUtils] = set()
        for callback in all_callbacks:
            if isinstance(callback, CustomLogger) and isinstance(
                callback, AdditionalLoggingUtils
            ):
                matched_callbacks.add(callback)
        return matched_callbacks
|
|
|
@ -1421,6 +1421,8 @@ def anthropic_messages_pt( # noqa: PLR0915
|
|||
)
|
||||
|
||||
user_content.append(_content_element)
|
||||
elif m.get("type", "") == "document":
|
||||
user_content.append(cast(AnthropicMessagesDocumentParam, m))
|
||||
elif isinstance(user_message_types_block["content"], str):
|
||||
_anthropic_content_text_element: AnthropicMessagesTextParam = {
|
||||
"type": "text",
|
||||
|
|
litellm/litellm_core_utils/sensitive_data_masker.py (new file, 81 lines)
@ -0,0 +1,81 @@
from typing import Any, Dict, Optional, Set


class SensitiveDataMasker:
    def __init__(
        self,
        sensitive_patterns: Optional[Set[str]] = None,
        visible_prefix: int = 4,
        visible_suffix: int = 4,
        mask_char: str = "*",
    ):
        self.sensitive_patterns = sensitive_patterns or {
            "password",
            "secret",
            "key",
            "token",
            "auth",
            "credential",
            "access",
            "private",
            "certificate",
        }

        self.visible_prefix = visible_prefix
        self.visible_suffix = visible_suffix
        self.mask_char = mask_char

    def _mask_value(self, value: str) -> str:
        if not value or len(str(value)) < (self.visible_prefix + self.visible_suffix):
            return value

        value_str = str(value)
        masked_length = len(value_str) - (self.visible_prefix + self.visible_suffix)
        return f"{value_str[:self.visible_prefix]}{self.mask_char * masked_length}{value_str[-self.visible_suffix:]}"

    def is_sensitive_key(self, key: str) -> bool:
        key_lower = str(key).lower()
        result = any(pattern in key_lower for pattern in self.sensitive_patterns)
        return result

    def mask_dict(
        self, data: Dict[str, Any], depth: int = 0, max_depth: int = 10
    ) -> Dict[str, Any]:
        if depth >= max_depth:
            return data

        masked_data: Dict[str, Any] = {}
        for k, v in data.items():
            try:
                if isinstance(v, dict):
                    masked_data[k] = self.mask_dict(v, depth + 1)
                elif hasattr(v, "__dict__") and not isinstance(v, type):
                    masked_data[k] = self.mask_dict(vars(v), depth + 1)
                elif self.is_sensitive_key(k):
                    str_value = str(v) if v is not None else ""
                    masked_data[k] = self._mask_value(str_value)
                else:
                    masked_data[k] = (
                        v if isinstance(v, (int, float, bool, str)) else str(v)
                    )
            except Exception:
                masked_data[k] = "<unable to serialize>"

        return masked_data


# Usage example:
"""
masker = SensitiveDataMasker()
data = {
    "api_key": "sk-1234567890abcdef",
    "redis_password": "very_secret_pass",
    "port": 6379
}
masked = masker.mask_dict(data)
# Result: {
#   "api_key": "sk-1****cdef",
#   "redis_password": "very****pass",
#   "port": 6379
# }
"""
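
Note: the new file above already carries a flat-dict usage example in its closing docstring; the sketch below additionally exercises the nested-dict branch of mask_dict. The import path is taken from the file header of this diff; the sample values are illustrative.

from litellm.litellm_core_utils.sensitive_data_masker import SensitiveDataMasker

masker = SensitiveDataMasker(visible_prefix=2, visible_suffix=2)
config = {
    "redis": {"host": "localhost", "password": "hunter2-hunter2"},
    "api_key": "sk-abcdef123456",
    "timeout": 30,
}
print(masker.mask_dict(config))
# nested "password" and top-level "api_key" come back masked; "host" and "timeout" pass through unchanged
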
|
@ -809,7 +809,10 @@ class CustomStreamWrapper:
|
|||
if self.sent_first_chunk is False:
|
||||
completion_obj["role"] = "assistant"
|
||||
self.sent_first_chunk = True
|
||||
|
||||
if response_obj.get("provider_specific_fields") is not None:
|
||||
completion_obj["provider_specific_fields"] = response_obj[
|
||||
"provider_specific_fields"
|
||||
]
|
||||
model_response.choices[0].delta = Delta(**completion_obj)
|
||||
_index: Optional[int] = completion_obj.get("index")
|
||||
if _index is not None:
|
||||
|
|
|
@ -4,7 +4,7 @@ Calling + translation logic for anthropic's `/v1/messages` endpoint
|
|||
|
||||
import copy
|
||||
import json
|
||||
from typing import Any, Callable, List, Optional, Tuple, Union
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import httpx # type: ignore
|
||||
|
||||
|
@ -506,6 +506,29 @@ class ModelResponseIterator:

        return usage_block

    def _content_block_delta_helper(self, chunk: dict):
        text = ""
        tool_use: Optional[ChatCompletionToolCallChunk] = None
        provider_specific_fields = {}
        content_block = ContentBlockDelta(**chunk)  # type: ignore
        self.content_blocks.append(content_block)
        if "text" in content_block["delta"]:
            text = content_block["delta"]["text"]
        elif "partial_json" in content_block["delta"]:
            tool_use = {
                "id": None,
                "type": "function",
                "function": {
                    "name": None,
                    "arguments": content_block["delta"]["partial_json"],
                },
                "index": self.tool_index,
            }
        elif "citation" in content_block["delta"]:
            provider_specific_fields["citation"] = content_block["delta"]["citation"]

        return text, tool_use, provider_specific_fields

    def chunk_parser(self, chunk: dict) -> GenericStreamingChunk:
        try:
            type_chunk = chunk.get("type", "") or ""
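
Note: a standalone sketch of the branching _content_block_delta_helper performs on Anthropic stream events; the dicts below are sample payload shapes based on the chunk comment later in this diff, not captured traffic.

def split_delta(chunk: dict):
    # Returns (text, partial_tool_json, provider_specific_fields), mirroring the helper above.
    delta = chunk.get("delta", {})
    if "text" in delta:
        return delta["text"], None, {}
    if "partial_json" in delta:
        return "", delta["partial_json"], {}
    if "citation" in delta:
        return "", None, {"citation": delta["citation"]}
    return "", None, {}

print(split_delta({"type": "content_block_delta", "index": 0,
                   "delta": {"type": "text_delta", "text": "Hello"}}))
# ('Hello', None, {})
print(split_delta({"type": "content_block_delta", "index": 0,
                   "delta": {"type": "input_json_delta", "partial_json": '{"city": "Par'}}))
# ('', '{"city": "Par', {})
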
@ -515,6 +538,7 @@ class ModelResponseIterator:
|
|||
is_finished = False
|
||||
finish_reason = ""
|
||||
usage: Optional[ChatCompletionUsageBlock] = None
|
||||
provider_specific_fields: Dict[str, Any] = {}
|
||||
|
||||
index = int(chunk.get("index", 0))
|
||||
if type_chunk == "content_block_delta":
|
||||
|
@ -522,20 +546,9 @@ class ModelResponseIterator:
|
|||
Anthropic content chunk
|
||||
chunk = {'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': 'Hello'}}
|
||||
"""
|
||||
content_block = ContentBlockDelta(**chunk) # type: ignore
|
||||
self.content_blocks.append(content_block)
|
||||
if "text" in content_block["delta"]:
|
||||
text = content_block["delta"]["text"]
|
||||
elif "partial_json" in content_block["delta"]:
|
||||
tool_use = {
|
||||
"id": None,
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": None,
|
||||
"arguments": content_block["delta"]["partial_json"],
|
||||
},
|
||||
"index": self.tool_index,
|
||||
}
|
||||
text, tool_use, provider_specific_fields = (
|
||||
self._content_block_delta_helper(chunk=chunk)
|
||||
)
|
||||
elif type_chunk == "content_block_start":
|
||||
"""
|
||||
event: content_block_start
|
||||
|
@ -628,6 +641,9 @@ class ModelResponseIterator:
|
|||
finish_reason=finish_reason,
|
||||
usage=usage,
|
||||
index=index,
|
||||
provider_specific_fields=(
|
||||
provider_specific_fields if provider_specific_fields else None
|
||||
),
|
||||
)
|
||||
|
||||
return returned_chunk
|
||||
|
|
|
@ -70,7 +70,7 @@ class AnthropicConfig(BaseConfig):
|
|||
metadata: Optional[dict] = None,
|
||||
system: Optional[str] = None,
|
||||
) -> None:
|
||||
locals_ = locals()
|
||||
locals_ = locals().copy()
|
||||
for key, value in locals_.items():
|
||||
if key != "self" and value is not None:
|
||||
setattr(self.__class__, key, value)
|
||||
|
@ -628,6 +628,7 @@ class AnthropicConfig(BaseConfig):
|
|||
)
|
||||
else:
|
||||
text_content = ""
|
||||
citations: List[Any] = []
|
||||
tool_calls: List[ChatCompletionToolCallChunk] = []
|
||||
for idx, content in enumerate(completion_response["content"]):
|
||||
if content["type"] == "text":
|
||||
|
@ -645,10 +646,14 @@ class AnthropicConfig(BaseConfig):
|
|||
index=idx,
|
||||
)
|
||||
)
|
||||
## CITATIONS
|
||||
if content.get("citations", None) is not None:
|
||||
citations.append(content["citations"])
|
||||
|
||||
_message = litellm.Message(
|
||||
tool_calls=tool_calls,
|
||||
content=text_content or None,
|
||||
provider_specific_fields={"citations": citations},
|
||||
)
|
||||
|
||||
## HANDLE JSON MODE - anthropic returns single function call
|
||||
|
|
|
@ -72,7 +72,7 @@ class AnthropicTextConfig(BaseConfig):
|
|||
top_k: Optional[int] = None,
|
||||
metadata: Optional[dict] = None,
|
||||
) -> None:
|
||||
locals_ = locals()
|
||||
locals_ = locals().copy()
|
||||
for key, value in locals_.items():
|
||||
if key != "self" and value is not None:
|
||||
setattr(self.__class__, key, value)
|
||||
|
|
|
@ -5,10 +5,11 @@ import time
|
|||
from typing import Any, Callable, Dict, List, Literal, Optional, Union
|
||||
|
||||
import httpx # type: ignore
|
||||
from openai import AsyncAzureOpenAI, AzureOpenAI
|
||||
from openai import APITimeoutError, AsyncAzureOpenAI, AzureOpenAI
|
||||
|
||||
import litellm
|
||||
from litellm.caching.caching import DualCache
|
||||
from litellm.constants import DEFAULT_MAX_RETRIES
|
||||
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
|
||||
from litellm.llms.custom_httpx.http_handler import (
|
||||
AsyncHTTPHandler,
|
||||
|
@ -98,14 +99,6 @@ class AzureOpenAIAssistantsAPIConfig:
|
|||
|
||||
|
||||
def select_azure_base_url_or_endpoint(azure_client_params: dict):
|
||||
# azure_client_params = {
|
||||
# "api_version": api_version,
|
||||
# "azure_endpoint": api_base,
|
||||
# "azure_deployment": model,
|
||||
# "http_client": litellm.client_session,
|
||||
# "max_retries": max_retries,
|
||||
# "timeout": timeout,
|
||||
# }
|
||||
azure_endpoint = azure_client_params.get("azure_endpoint", None)
|
||||
if azure_endpoint is not None:
|
||||
# see : https://github.com/openai/openai-python/blob/3d61ed42aba652b547029095a7eb269ad4e1e957/src/openai/lib/azure.py#L192
|
||||
|
@ -312,6 +305,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
- call chat.completions.create.with_raw_response when litellm.return_response_headers is True
|
||||
- call chat.completions.create by default
|
||||
"""
|
||||
start_time = time.time()
|
||||
try:
|
||||
raw_response = await azure_client.chat.completions.with_raw_response.create(
|
||||
**data, timeout=timeout
|
||||
|
@ -320,6 +314,11 @@ class AzureChatCompletion(BaseLLM):
|
|||
headers = dict(raw_response.headers)
|
||||
response = raw_response.parse()
|
||||
return headers, response
|
||||
except APITimeoutError as e:
|
||||
end_time = time.time()
|
||||
time_delta = round(end_time - start_time, 2)
|
||||
e.message += f" - timeout value={timeout}, time taken={time_delta} seconds"
|
||||
raise e
|
||||
except Exception as e:
|
||||
raise e
|
||||
|
||||
|
@ -353,7 +352,9 @@ class AzureChatCompletion(BaseLLM):
|
|||
status_code=422, message="Missing model or messages"
|
||||
)
|
||||
|
||||
max_retries = optional_params.pop("max_retries", 2)
|
||||
max_retries = optional_params.pop("max_retries", None)
|
||||
if max_retries is None:
|
||||
max_retries = DEFAULT_MAX_RETRIES
|
||||
json_mode: Optional[bool] = optional_params.pop("json_mode", False)
|
||||
|
||||
### CHECK IF CLOUDFLARE AI GATEWAY ###
|
||||
|
@ -415,6 +416,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
azure_ad_token_provider=azure_ad_token_provider,
|
||||
timeout=timeout,
|
||||
client=client,
|
||||
max_retries=max_retries,
|
||||
)
|
||||
else:
|
||||
return self.acompletion(
|
||||
|
@ -430,6 +432,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
timeout=timeout,
|
||||
client=client,
|
||||
logging_obj=logging_obj,
|
||||
max_retries=max_retries,
|
||||
convert_tool_call_to_json_mode=json_mode,
|
||||
)
|
||||
elif "stream" in optional_params and optional_params["stream"] is True:
|
||||
|
@ -445,6 +448,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
azure_ad_token_provider=azure_ad_token_provider,
|
||||
timeout=timeout,
|
||||
client=client,
|
||||
max_retries=max_retries,
|
||||
)
|
||||
else:
|
||||
## LOGGING
|
||||
|
@ -553,6 +557,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
dynamic_params: bool,
|
||||
model_response: ModelResponse,
|
||||
logging_obj: LiteLLMLoggingObj,
|
||||
max_retries: int,
|
||||
azure_ad_token: Optional[str] = None,
|
||||
azure_ad_token_provider: Optional[Callable] = None,
|
||||
convert_tool_call_to_json_mode: Optional[bool] = None,
|
||||
|
@ -560,12 +565,6 @@ class AzureChatCompletion(BaseLLM):
|
|||
):
|
||||
response = None
|
||||
try:
|
||||
max_retries = data.pop("max_retries", 2)
|
||||
if not isinstance(max_retries, int):
|
||||
raise AzureOpenAIError(
|
||||
status_code=422, message="max retries must be an int"
|
||||
)
|
||||
|
||||
# init AzureOpenAI Client
|
||||
azure_client_params = {
|
||||
"api_version": api_version,
|
||||
|
@ -649,6 +648,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
)
|
||||
raise AzureOpenAIError(status_code=500, message=str(e))
|
||||
except Exception as e:
|
||||
message = getattr(e, "message", str(e))
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=data["messages"],
|
||||
|
@ -659,7 +659,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
if hasattr(e, "status_code"):
|
||||
raise e
|
||||
else:
|
||||
raise AzureOpenAIError(status_code=500, message=str(e))
|
||||
raise AzureOpenAIError(status_code=500, message=message)
|
||||
|
||||
def streaming(
|
||||
self,
|
||||
|
@ -671,15 +671,11 @@ class AzureChatCompletion(BaseLLM):
|
|||
data: dict,
|
||||
model: str,
|
||||
timeout: Any,
|
||||
max_retries: int,
|
||||
azure_ad_token: Optional[str] = None,
|
||||
azure_ad_token_provider: Optional[Callable] = None,
|
||||
client=None,
|
||||
):
|
||||
max_retries = data.pop("max_retries", 2)
|
||||
if not isinstance(max_retries, int):
|
||||
raise AzureOpenAIError(
|
||||
status_code=422, message="max retries must be an int"
|
||||
)
|
||||
# init AzureOpenAI Client
|
||||
azure_client_params = {
|
||||
"api_version": api_version,
|
||||
|
@ -742,6 +738,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
data: dict,
|
||||
model: str,
|
||||
timeout: Any,
|
||||
max_retries: int,
|
||||
azure_ad_token: Optional[str] = None,
|
||||
azure_ad_token_provider: Optional[Callable] = None,
|
||||
client=None,
|
||||
|
@ -753,7 +750,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
"azure_endpoint": api_base,
|
||||
"azure_deployment": model,
|
||||
"http_client": litellm.aclient_session,
|
||||
"max_retries": data.pop("max_retries", 2),
|
||||
"max_retries": max_retries,
|
||||
"timeout": timeout,
|
||||
}
|
||||
azure_client_params = select_azure_base_url_or_endpoint(
|
||||
|
@ -807,10 +804,11 @@ class AzureChatCompletion(BaseLLM):
|
|||
status_code = getattr(e, "status_code", 500)
|
||||
error_headers = getattr(e, "headers", None)
|
||||
error_response = getattr(e, "response", None)
|
||||
message = getattr(e, "message", str(e))
|
||||
if error_headers is None and error_response:
|
||||
error_headers = getattr(error_response, "headers", None)
|
||||
raise AzureOpenAIError(
|
||||
status_code=status_code, message=str(e), headers=error_headers
|
||||
status_code=status_code, message=message, headers=error_headers
|
||||
)
|
||||
|
||||
async def aembedding(
|
||||
|
|
|
@ -113,6 +113,17 @@ class AzureOpenAIConfig(BaseConfig):
|
|||
|
||||
return False
|
||||
|
||||
def _is_response_format_supported_api_version(
|
||||
self, api_version_year: str, api_version_month: str
|
||||
) -> bool:
|
||||
"""
|
||||
- check if api_version is supported for response_format
|
||||
"""
|
||||
|
||||
is_supported = int(api_version_year) <= 2024 and int(api_version_month) >= 8
|
||||
|
||||
return is_supported
|
||||
|
||||
def map_openai_params(
|
||||
self,
|
||||
non_default_params: dict,
|
||||
|
@ -171,13 +182,20 @@ class AzureOpenAIConfig(BaseConfig):
|
|||
_is_response_format_supported_model = (
|
||||
self._is_response_format_supported_model(model)
|
||||
)
|
||||
should_convert_response_format_to_tool = (
|
||||
api_version_year <= "2024" and api_version_month < "08"
|
||||
) or not _is_response_format_supported_model
|
||||
|
||||
is_response_format_supported_api_version = (
|
||||
self._is_response_format_supported_api_version(
|
||||
api_version_year, api_version_month
|
||||
)
|
||||
)
|
||||
is_response_format_supported = (
|
||||
is_response_format_supported_api_version
|
||||
and _is_response_format_supported_model
|
||||
)
|
||||
optional_params = self._add_response_format_to_tools(
|
||||
optional_params=optional_params,
|
||||
value=value,
|
||||
should_convert_response_format_to_tool=should_convert_response_format_to_tool,
|
||||
is_response_format_supported=is_response_format_supported,
|
||||
)
|
||||
elif param == "tools" and isinstance(value, list):
|
||||
optional_params.setdefault("tools", [])
|
||||
|
|
|
@ -131,6 +131,7 @@ class AzureTextCompletion(BaseLLM):
|
|||
timeout=timeout,
|
||||
client=client,
|
||||
logging_obj=logging_obj,
|
||||
max_retries=max_retries,
|
||||
)
|
||||
elif "stream" in optional_params and optional_params["stream"] is True:
|
||||
return self.streaming(
|
||||
|
@ -236,17 +237,12 @@ class AzureTextCompletion(BaseLLM):
|
|||
timeout: Any,
|
||||
model_response: ModelResponse,
|
||||
logging_obj: Any,
|
||||
max_retries: int,
|
||||
azure_ad_token: Optional[str] = None,
|
||||
client=None, # this is the AsyncAzureOpenAI
|
||||
):
|
||||
response = None
|
||||
try:
|
||||
max_retries = data.pop("max_retries", 2)
|
||||
if not isinstance(max_retries, int):
|
||||
raise AzureOpenAIError(
|
||||
status_code=422, message="max retries must be an int"
|
||||
)
|
||||
|
||||
# init AzureOpenAI Client
|
||||
azure_client_params = {
|
||||
"api_version": api_version,
|
||||
|
|
|
@ -34,6 +34,17 @@ class BaseLLMModelInfo(ABC):
|
|||
def get_api_base(api_base: Optional[str] = None) -> Optional[str]:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_base_model(model: str) -> Optional[str]:
|
||||
"""
|
||||
Returns the base model name from the given model name.
|
||||
|
||||
Some providers like bedrock - can receive model=`invoke/anthropic.claude-3-opus-20240229-v1:0` or `converse/anthropic.claude-3-opus-20240229-v1:0`
|
||||
This function will return `anthropic.claude-3-opus-20240229-v1:0`
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
def _dict_to_response_format_helper(
|
||||
response_format: dict, ref_template: Optional[str] = None
|
||||
|
|
|
@ -20,6 +20,7 @@ from pydantic import BaseModel
|
|||
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
|
||||
from litellm.types.llms.openai import (
|
||||
AllMessageValues,
|
||||
ChatCompletionToolChoiceFunctionParam,
|
||||
|
@ -27,9 +28,6 @@ from litellm.types.llms.openai import (
|
|||
ChatCompletionToolParam,
|
||||
ChatCompletionToolParamFunctionChunk,
|
||||
)
|
||||
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
|
||||
|
||||
from litellm.types.utils import ModelResponse
|
||||
from litellm.utils import CustomStreamWrapper
|
||||
|
||||
|
@ -163,7 +161,7 @@ class BaseConfig(ABC):
|
|||
self,
|
||||
optional_params: dict,
|
||||
value: dict,
|
||||
should_convert_response_format_to_tool: bool,
|
||||
is_response_format_supported: bool,
|
||||
) -> dict:
|
||||
"""
|
||||
Follow similar approach to anthropic - translate to a single tool call.
|
||||
|
@ -183,7 +181,8 @@ class BaseConfig(ABC):
|
|||
elif "json_schema" in value:
|
||||
json_schema = value["json_schema"]["schema"]
|
||||
|
||||
if json_schema and should_convert_response_format_to_tool:
|
||||
if json_schema and not is_response_format_supported:
|
||||
|
||||
_tool_choice = ChatCompletionToolChoiceObjectParam(
|
||||
type="function",
|
||||
function=ChatCompletionToolChoiceFunctionParam(
|
||||
|
|
|
@ -52,6 +52,7 @@ class BaseAWSLLM:
|
|||
"aws_role_name",
|
||||
"aws_web_identity_token",
|
||||
"aws_sts_endpoint",
|
||||
"aws_bedrock_runtime_endpoint",
|
||||
]
|
||||
|
||||
def get_cache_key(self, credential_args: Dict[str, Optional[str]]) -> str:
|
||||
|
|
|
@ -33,14 +33,7 @@ from litellm.types.llms.openai import (
|
|||
from litellm.types.utils import ModelResponse, Usage
|
||||
from litellm.utils import add_dummy_tool, has_tool_call_blocks
|
||||
|
||||
from ..common_utils import (
|
||||
AmazonBedrockGlobalConfig,
|
||||
BedrockError,
|
||||
get_bedrock_tool_name,
|
||||
)
|
||||
|
||||
global_config = AmazonBedrockGlobalConfig()
|
||||
all_global_regions = global_config.get_all_regions()
|
||||
from ..common_utils import BedrockError, BedrockModelInfo, get_bedrock_tool_name
|
||||
|
||||
|
||||
class AmazonConverseConfig(BaseConfig):
|
||||
|
@ -63,7 +56,7 @@ class AmazonConverseConfig(BaseConfig):
|
|||
topP: Optional[int] = None,
|
||||
topK: Optional[int] = None,
|
||||
) -> None:
|
||||
locals_ = locals()
|
||||
locals_ = locals().copy()
|
||||
for key, value in locals_.items():
|
||||
if key != "self" and value is not None:
|
||||
setattr(self.__class__, key, value)
|
||||
|
@ -104,7 +97,7 @@ class AmazonConverseConfig(BaseConfig):
|
|||
]
|
||||
|
||||
## Filter out 'cross-region' from model name
|
||||
base_model = self._get_base_model(model)
|
||||
base_model = BedrockModelInfo.get_base_model(model)
|
||||
|
||||
if (
|
||||
base_model.startswith("anthropic")
|
||||
|
@ -341,9 +334,9 @@ class AmazonConverseConfig(BaseConfig):
|
|||
if "top_k" in inference_params:
|
||||
inference_params["topK"] = inference_params.pop("top_k")
|
||||
return InferenceConfig(**inference_params)
|
||||
|
||||
|
||||
def _handle_top_k_value(self, model: str, inference_params: dict) -> dict:
|
||||
base_model = self._get_base_model(model)
|
||||
base_model = BedrockModelInfo.get_base_model(model)
|
||||
|
||||
val_top_k = None
|
||||
if "topK" in inference_params:
|
||||
|
@ -352,11 +345,11 @@ class AmazonConverseConfig(BaseConfig):
|
|||
val_top_k = inference_params.pop("top_k")
|
||||
|
||||
if val_top_k:
|
||||
if (base_model.startswith("anthropic")):
|
||||
if base_model.startswith("anthropic"):
|
||||
return {"top_k": val_top_k}
|
||||
if base_model.startswith("amazon.nova"):
|
||||
return {'inferenceConfig': {"topK": val_top_k}}
|
||||
|
||||
return {"inferenceConfig": {"topK": val_top_k}}
|
||||
|
||||
return {}
|
||||
|
||||
def _transform_request_helper(
|
||||
|
@ -393,15 +386,25 @@ class AmazonConverseConfig(BaseConfig):
|
|||
) + ["top_k"]
|
||||
supported_tool_call_params = ["tools", "tool_choice"]
|
||||
supported_guardrail_params = ["guardrailConfig"]
|
||||
total_supported_params = supported_converse_params + supported_tool_call_params + supported_guardrail_params
|
||||
total_supported_params = (
|
||||
supported_converse_params
|
||||
+ supported_tool_call_params
|
||||
+ supported_guardrail_params
|
||||
)
|
||||
inference_params.pop("json_mode", None) # used for handling json_schema
|
||||
|
||||
# keep supported params in 'inference_params', and set all model-specific params in 'additional_request_params'
|
||||
additional_request_params = {k: v for k, v in inference_params.items() if k not in total_supported_params}
|
||||
inference_params = {k: v for k, v in inference_params.items() if k in total_supported_params}
|
||||
additional_request_params = {
|
||||
k: v for k, v in inference_params.items() if k not in total_supported_params
|
||||
}
|
||||
inference_params = {
|
||||
k: v for k, v in inference_params.items() if k in total_supported_params
|
||||
}
|
||||
|
||||
# Only set the topK value in for models that support it
|
||||
additional_request_params.update(self._handle_top_k_value(model, inference_params))
|
||||
additional_request_params.update(
|
||||
self._handle_top_k_value(model, inference_params)
|
||||
)
|
||||
|
||||
bedrock_tools: List[ToolBlock] = _bedrock_tools_pt(
|
||||
inference_params.pop("tools", [])
|
||||
|
@ -679,41 +682,6 @@ class AmazonConverseConfig(BaseConfig):
|
|||
|
||||
return model_response
|
||||
|
||||
def _supported_cross_region_inference_region(self) -> List[str]:
|
||||
"""
|
||||
Abbreviations of regions AWS Bedrock supports for cross region inference
|
||||
"""
|
||||
return ["us", "eu", "apac"]
|
||||
|
||||
def _get_base_model(self, model: str) -> str:
|
||||
"""
|
||||
Get the base model from the given model name.
|
||||
|
||||
Handle model names like - "us.meta.llama3-2-11b-instruct-v1:0" -> "meta.llama3-2-11b-instruct-v1"
|
||||
AND "meta.llama3-2-11b-instruct-v1:0" -> "meta.llama3-2-11b-instruct-v1"
|
||||
"""
|
||||
|
||||
if model.startswith("bedrock/"):
|
||||
model = model.split("/", 1)[1]
|
||||
|
||||
if model.startswith("converse/"):
|
||||
model = model.split("/", 1)[1]
|
||||
|
||||
potential_region = model.split(".", 1)[0]
|
||||
|
||||
alt_potential_region = model.split("/", 1)[
|
||||
0
|
||||
] # in model cost map we store regional information like `/us-west-2/bedrock-model`
|
||||
|
||||
if potential_region in self._supported_cross_region_inference_region():
|
||||
return model.split(".", 1)[1]
|
||||
elif (
|
||||
alt_potential_region in all_global_regions and len(model.split("/", 1)) > 1
|
||||
):
|
||||
return model.split("/", 1)[1]
|
||||
|
||||
return model
|
||||
|
||||
def get_error_class(
|
||||
self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers]
|
||||
) -> BaseLLMException:
|
||||
|
|
|
@ -40,6 +40,9 @@ from litellm.litellm_core_utils.prompt_templates.factory import (
|
|||
parse_xml_params,
|
||||
prompt_factory,
|
||||
)
|
||||
from litellm.llms.anthropic.chat.handler import (
|
||||
ModelResponseIterator as AnthropicModelResponseIterator,
|
||||
)
|
||||
from litellm.llms.custom_httpx.http_handler import (
|
||||
AsyncHTTPHandler,
|
||||
HTTPHandler,
|
||||
|
@ -103,7 +106,7 @@ class AmazonCohereChatConfig:
|
|||
stop_sequences: Optional[str] = None,
|
||||
raw_prompting: Optional[bool] = None,
|
||||
) -> None:
|
||||
locals_ = locals()
|
||||
locals_ = locals().copy()
|
||||
for key, value in locals_.items():
|
||||
if key != "self" and value is not None:
|
||||
setattr(self.__class__, key, value)
|
||||
|
@ -177,6 +180,7 @@ async def make_call(
|
|||
logging_obj: Logging,
|
||||
fake_stream: bool = False,
|
||||
json_mode: Optional[bool] = False,
|
||||
bedrock_invoke_provider: Optional[litellm.BEDROCK_INVOKE_PROVIDERS_LITERAL] = None,
|
||||
):
|
||||
try:
|
||||
if client is None:
|
||||
|
@ -214,6 +218,14 @@ async def make_call(
|
|||
completion_stream: Any = MockResponseIterator(
|
||||
model_response=model_response, json_mode=json_mode
|
||||
)
|
||||
elif bedrock_invoke_provider == "anthropic":
|
||||
decoder: AWSEventStreamDecoder = AmazonAnthropicClaudeStreamDecoder(
|
||||
model=model,
|
||||
sync_stream=False,
|
||||
)
|
||||
completion_stream = decoder.aiter_bytes(
|
||||
response.aiter_bytes(chunk_size=1024)
|
||||
)
|
||||
else:
|
||||
decoder = AWSEventStreamDecoder(model=model)
|
||||
completion_stream = decoder.aiter_bytes(
|
||||
|
@ -248,6 +260,7 @@ def make_sync_call(
|
|||
logging_obj: Logging,
|
||||
fake_stream: bool = False,
|
||||
json_mode: Optional[bool] = False,
|
||||
bedrock_invoke_provider: Optional[litellm.BEDROCK_INVOKE_PROVIDERS_LITERAL] = None,
|
||||
):
|
||||
try:
|
||||
if client is None:
|
||||
|
@ -283,6 +296,12 @@ def make_sync_call(
|
|||
completion_stream: Any = MockResponseIterator(
|
||||
model_response=model_response, json_mode=json_mode
|
||||
)
|
||||
elif bedrock_invoke_provider == "anthropic":
|
||||
decoder: AWSEventStreamDecoder = AmazonAnthropicClaudeStreamDecoder(
|
||||
model=model,
|
||||
sync_stream=True,
|
||||
)
|
||||
completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=1024))
|
||||
else:
|
||||
decoder = AWSEventStreamDecoder(model=model)
|
||||
completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=1024))
|
||||
|
@ -1323,7 +1342,7 @@ class AWSEventStreamDecoder:
|
|||
text = chunk_data.get("completions")[0].get("data").get("text") # type: ignore
|
||||
is_finished = True
|
||||
finish_reason = "stop"
|
||||
######## bedrock.anthropic mappings ###############
|
||||
######## /bedrock/converse mappings ###############
|
||||
elif (
|
||||
"contentBlockIndex" in chunk_data
|
||||
or "stopReason" in chunk_data
|
||||
|
@ -1331,6 +1350,11 @@ class AWSEventStreamDecoder:
|
|||
or "trace" in chunk_data
|
||||
):
|
||||
return self.converse_chunk_parser(chunk_data=chunk_data)
|
||||
######### /bedrock/invoke nova mappings ###############
|
||||
elif "contentBlockDelta" in chunk_data:
|
||||
# when using /bedrock/invoke/nova, the chunk_data is nested under "contentBlockDelta"
|
||||
_chunk_data = chunk_data.get("contentBlockDelta", None)
|
||||
return self.converse_chunk_parser(chunk_data=_chunk_data)
|
||||
######## bedrock.mistral mappings ###############
|
||||
elif "outputs" in chunk_data:
|
||||
if (
|
||||
|
@ -1429,6 +1453,27 @@ class AWSEventStreamDecoder:
            return chunk.decode()  # type: ignore[no-any-return]


class AmazonAnthropicClaudeStreamDecoder(AWSEventStreamDecoder):
    def __init__(
        self,
        model: str,
        sync_stream: bool,
    ) -> None:
        """
        Child class of AWSEventStreamDecoder that handles the streaming response from the Anthropic family of models

        The only difference between AWSEventStreamDecoder and AmazonAnthropicClaudeStreamDecoder is the `chunk_parser` method
        """
        super().__init__(model=model)
        self.anthropic_model_response_iterator = AnthropicModelResponseIterator(
            streaming_response=None,
            sync_stream=sync_stream,
        )

    def _chunk_parser(self, chunk_data: dict) -> GChunk:
        return self.anthropic_model_response_iterator.chunk_parser(chunk=chunk_data)


class MockResponseIterator:  # for returning ai21 streaming responses
    def __init__(self, model_response, json_mode: Optional[bool] = False):
        self.model_response = model_response
|
|
|
@ -46,7 +46,7 @@ class AmazonAI21Config(AmazonInvokeConfig, BaseConfig):
|
|||
presencePenalty: Optional[dict] = None,
|
||||
countPenalty: Optional[dict] = None,
|
||||
) -> None:
|
||||
locals_ = locals()
|
||||
locals_ = locals().copy()
|
||||
for key, value in locals_.items():
|
||||
if key != "self" and value is not None:
|
||||
setattr(self.__class__, key, value)
|
||||
|
|
|
@ -28,7 +28,7 @@ class AmazonCohereConfig(AmazonInvokeConfig, BaseConfig):
|
|||
temperature: Optional[float] = None,
|
||||
return_likelihood: Optional[str] = None,
|
||||
) -> None:
|
||||
locals_ = locals()
|
||||
locals_ = locals().copy()
|
||||
for key, value in locals_.items():
|
||||
if key != "self" and value is not None:
|
||||
setattr(self.__class__, key, value)
|
||||
|
|
|
@ -28,7 +28,7 @@ class AmazonLlamaConfig(AmazonInvokeConfig, BaseConfig):
|
|||
temperature: Optional[float] = None,
|
||||
topP: Optional[int] = None,
|
||||
) -> None:
|
||||
locals_ = locals()
|
||||
locals_ = locals().copy()
|
||||
for key, value in locals_.items():
|
||||
if key != "self" and value is not None:
|
||||
setattr(self.__class__, key, value)
|
||||
|
|
|
@ -33,7 +33,7 @@ class AmazonMistralConfig(AmazonInvokeConfig, BaseConfig):
|
|||
top_k: Optional[float] = None,
|
||||
stop: Optional[List[str]] = None,
|
||||
) -> None:
|
||||
locals_ = locals()
|
||||
locals_ = locals().copy()
|
||||
for key, value in locals_.items():
|
||||
if key != "self" and value is not None:
|
||||
setattr(self.__class__, key, value)
|
||||
|
|
|
@ -0,0 +1,70 @@
"""
Handles transforming requests for `bedrock/invoke/{nova}` models

Inherits from `AmazonConverseConfig`

Nova + Invoke API Tutorial: https://docs.aws.amazon.com/nova/latest/userguide/using-invoke-api.html
"""

from typing import List

import litellm
from litellm.types.llms.bedrock import BedrockInvokeNovaRequest
from litellm.types.llms.openai import AllMessageValues


class AmazonInvokeNovaConfig(litellm.AmazonConverseConfig):
    """
    Config for sending `nova` requests to `/bedrock/invoke/`
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def transform_request(
        self,
        model: str,
        messages: List[AllMessageValues],
        optional_params: dict,
        litellm_params: dict,
        headers: dict,
    ) -> dict:
        _transformed_nova_request = super().transform_request(
            model=model,
            messages=messages,
            optional_params=optional_params,
            litellm_params=litellm_params,
            headers=headers,
        )
        _bedrock_invoke_nova_request = BedrockInvokeNovaRequest(
            **_transformed_nova_request
        )
        self._remove_empty_system_messages(_bedrock_invoke_nova_request)
        bedrock_invoke_nova_request = self._filter_allowed_fields(
            _bedrock_invoke_nova_request
        )
        return bedrock_invoke_nova_request

    def _filter_allowed_fields(
        self, bedrock_invoke_nova_request: BedrockInvokeNovaRequest
    ) -> dict:
        """
        Filter out fields that are not allowed in the `BedrockInvokeNovaRequest` dataclass.
        """
        allowed_fields = set(BedrockInvokeNovaRequest.__annotations__.keys())
        return {
            k: v for k, v in bedrock_invoke_nova_request.items() if k in allowed_fields
        }

    def _remove_empty_system_messages(
        self, bedrock_invoke_nova_request: BedrockInvokeNovaRequest
    ) -> None:
        """
        In-place remove empty `system` messages from the request.

        /bedrock/invoke/ does not allow empty `system` messages.
        """
        _system_message = bedrock_invoke_nova_request.get("system", None)
        if isinstance(_system_message, list) and len(_system_message) == 0:
            bedrock_invoke_nova_request.pop("system", None)
        return
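
Note: a standalone sketch of the two clean-up steps this new config applies to a Converse-shaped payload before it is sent to /bedrock/invoke/; the allowed-field set and the sample payload are illustrative assumptions, not the real BedrockInvokeNovaRequest definition.

ALLOWED_FIELDS = {"messages", "system", "inferenceConfig", "toolConfig"}  # assumed subset

payload = {
    "messages": [{"role": "user", "content": [{"text": "hi"}]}],
    "system": [],                          # empty -> the invoke API rejects it, so it gets dropped
    "inferenceConfig": {"maxTokens": 256},
    "additionalModelRequestFields": {},    # not in the allowed set -> filtered out
}

if isinstance(payload.get("system"), list) and len(payload["system"]) == 0:
    payload.pop("system", None)
payload = {k: v for k, v in payload.items() if k in ALLOWED_FIELDS}

print(payload)
# {'messages': [...], 'inferenceConfig': {'maxTokens': 256}}
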
|
@ -33,7 +33,7 @@ class AmazonTitanConfig(AmazonInvokeConfig, BaseConfig):
|
|||
temperature: Optional[float] = None,
|
||||
topP: Optional[int] = None,
|
||||
) -> None:
|
||||
locals_ = locals()
|
||||
locals_ = locals().copy()
|
||||
for key, value in locals_.items():
|
||||
if key != "self" and value is not None:
|
||||
setattr(self.__class__, key, value)
|
||||
|
|
|
@ -34,7 +34,7 @@ class AmazonAnthropicConfig:
|
|||
top_p: Optional[int] = None,
|
||||
anthropic_version: Optional[str] = None,
|
||||
) -> None:
|
||||
locals_ = locals()
|
||||
locals_ = locals().copy()
|
||||
for key, value in locals_.items():
|
||||
if key != "self" and value is not None:
|
||||
setattr(self.__class__, key, value)
|
||||
|
|
|
@ -1,61 +1,34 @@
|
|||
import types
|
||||
from typing import List, Optional
|
||||
from typing import TYPE_CHECKING, Any, List, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
import litellm
|
||||
from litellm.llms.bedrock.chat.invoke_transformations.base_invoke_transformation import (
|
||||
AmazonInvokeConfig,
|
||||
)
|
||||
from litellm.types.llms.openai import AllMessageValues
|
||||
from litellm.types.utils import ModelResponse
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
|
||||
|
||||
LiteLLMLoggingObj = _LiteLLMLoggingObj
|
||||
else:
|
||||
LiteLLMLoggingObj = Any
|
||||
|
||||
|
||||
class AmazonAnthropicClaude3Config:
|
||||
class AmazonAnthropicClaude3Config(AmazonInvokeConfig):
|
||||
"""
|
||||
Reference:
|
||||
https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=claude
|
||||
https://docs.anthropic.com/claude/docs/models-overview#model-comparison
|
||||
|
||||
Supported Params for the Amazon / Anthropic Claude 3 models:
|
||||
|
||||
- `max_tokens` Required (integer) max tokens. Default is 4096
|
||||
- `anthropic_version` Required (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31"
|
||||
- `system` Optional (string) the system prompt, conversion from openai format to this is handled in factory.py
|
||||
- `temperature` Optional (float) The amount of randomness injected into the response
|
||||
- `top_p` Optional (float) Use nucleus sampling.
|
||||
- `top_k` Optional (int) Only sample from the top K options for each subsequent token
|
||||
- `stop_sequences` Optional (List[str]) Custom text sequences that cause the model to stop generating
|
||||
"""
|
||||
|
||||
max_tokens: Optional[int] = 4096 # Opus, Sonnet, and Haiku default
|
||||
anthropic_version: Optional[str] = "bedrock-2023-05-31"
|
||||
system: Optional[str] = None
|
||||
temperature: Optional[float] = None
|
||||
top_p: Optional[float] = None
|
||||
top_k: Optional[int] = None
|
||||
stop_sequences: Optional[List[str]] = None
|
||||
anthropic_version: str = "bedrock-2023-05-31"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
max_tokens: Optional[int] = None,
|
||||
anthropic_version: Optional[str] = None,
|
||||
) -> None:
|
||||
locals_ = locals()
|
||||
for key, value in locals_.items():
|
||||
if key != "self" and value is not None:
|
||||
setattr(self.__class__, key, value)
|
||||
|
||||
@classmethod
|
||||
def get_config(cls):
|
||||
return {
|
||||
k: v
|
||||
for k, v in cls.__dict__.items()
|
||||
if not k.startswith("__")
|
||||
and not isinstance(
|
||||
v,
|
||||
(
|
||||
types.FunctionType,
|
||||
types.BuiltinFunctionType,
|
||||
classmethod,
|
||||
staticmethod,
|
||||
),
|
||||
)
|
||||
and v is not None
|
||||
}
|
||||
|
||||
def get_supported_openai_params(self):
|
||||
def get_supported_openai_params(self, model: str):
|
||||
return [
|
||||
"max_tokens",
|
||||
"max_completion_tokens",
|
||||
|
@ -68,7 +41,13 @@ class AmazonAnthropicClaude3Config:
|
|||
"extra_headers",
|
||||
]
|
||||
|
||||
def map_openai_params(self, non_default_params: dict, optional_params: dict):
|
||||
def map_openai_params(
|
||||
self,
|
||||
non_default_params: dict,
|
||||
optional_params: dict,
|
||||
model: str,
|
||||
drop_params: bool,
|
||||
):
|
||||
for param, value in non_default_params.items():
|
||||
if param == "max_tokens" or param == "max_completion_tokens":
|
||||
optional_params["max_tokens"] = value
|
||||
|
@ -83,3 +62,53 @@ class AmazonAnthropicClaude3Config:
|
|||
if param == "top_p":
|
||||
optional_params["top_p"] = value
|
||||
return optional_params
|
||||
|
||||
def transform_request(
|
||||
self,
|
||||
model: str,
|
||||
messages: List[AllMessageValues],
|
||||
optional_params: dict,
|
||||
litellm_params: dict,
|
||||
headers: dict,
|
||||
) -> dict:
|
||||
_anthropic_request = litellm.AnthropicConfig().transform_request(
|
||||
model=model,
|
||||
messages=messages,
|
||||
optional_params=optional_params,
|
||||
litellm_params=litellm_params,
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
_anthropic_request.pop("model", None)
|
||||
if "anthropic_version" not in _anthropic_request:
|
||||
_anthropic_request["anthropic_version"] = self.anthropic_version
|
||||
|
||||
return _anthropic_request
|
||||
|
||||
def transform_response(
|
||||
self,
|
||||
model: str,
|
||||
raw_response: httpx.Response,
|
||||
model_response: ModelResponse,
|
||||
logging_obj: LiteLLMLoggingObj,
|
||||
request_data: dict,
|
||||
messages: List[AllMessageValues],
|
||||
optional_params: dict,
|
||||
litellm_params: dict,
|
||||
encoding: Any,
|
||||
api_key: Optional[str] = None,
|
||||
json_mode: Optional[bool] = None,
|
||||
) -> ModelResponse:
|
||||
return litellm.AnthropicConfig().transform_response(
|
||||
model=model,
|
||||
raw_response=raw_response,
|
||||
model_response=model_response,
|
||||
logging_obj=logging_obj,
|
||||
request_data=request_data,
|
||||
messages=messages,
|
||||
optional_params=optional_params,
|
||||
litellm_params=litellm_params,
|
||||
encoding=encoding,
|
||||
api_key=api_key,
|
||||
json_mode=json_mode,
|
||||
)
|
||||
|
|
|
@ -2,22 +2,18 @@ import copy
|
|||
import json
|
||||
import time
|
||||
import urllib.parse
|
||||
import uuid
|
||||
from functools import partial
|
||||
from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union, cast, get_args
|
||||
|
||||
import httpx
|
||||
|
||||
import litellm
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.litellm_core_utils.core_helpers import map_finish_reason
|
||||
from litellm.litellm_core_utils.logging_utils import track_llm_api_timing
|
||||
from litellm.litellm_core_utils.prompt_templates.factory import (
|
||||
cohere_message_pt,
|
||||
construct_tool_use_system_prompt,
|
||||
contains_tag,
|
||||
custom_prompt,
|
||||
extract_between_tags,
|
||||
parse_xml_params,
|
||||
prompt_factory,
|
||||
)
|
||||
from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
|
||||
|
@ -91,7 +87,7 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
|
|||
optional_params=optional_params,
|
||||
)
|
||||
### SET RUNTIME ENDPOINT ###
|
||||
aws_bedrock_runtime_endpoint = optional_params.pop(
|
||||
aws_bedrock_runtime_endpoint = optional_params.get(
|
||||
"aws_bedrock_runtime_endpoint", None
|
||||
) # https://bedrock-runtime.{region_name}.amazonaws.com
|
||||
endpoint_url, proxy_endpoint_url = self.get_runtime_endpoint(
|
||||
|
@ -129,15 +125,15 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
|
|||
|
||||
## CREDENTIALS ##
|
||||
# pop aws_secret_access_key, aws_access_key_id, aws_session_token, aws_region_name from kwargs, since completion calls fail with them
|
||||
extra_headers = optional_params.pop("extra_headers", None)
|
||||
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
|
||||
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
|
||||
aws_session_token = optional_params.pop("aws_session_token", None)
|
||||
aws_role_name = optional_params.pop("aws_role_name", None)
|
||||
aws_session_name = optional_params.pop("aws_session_name", None)
|
||||
aws_profile_name = optional_params.pop("aws_profile_name", None)
|
||||
aws_web_identity_token = optional_params.pop("aws_web_identity_token", None)
|
||||
aws_sts_endpoint = optional_params.pop("aws_sts_endpoint", None)
|
||||
extra_headers = optional_params.get("extra_headers", None)
|
||||
aws_secret_access_key = optional_params.get("aws_secret_access_key", None)
|
||||
aws_access_key_id = optional_params.get("aws_access_key_id", None)
|
||||
aws_session_token = optional_params.get("aws_session_token", None)
|
||||
aws_role_name = optional_params.get("aws_role_name", None)
|
||||
aws_session_name = optional_params.get("aws_session_name", None)
|
||||
aws_profile_name = optional_params.get("aws_profile_name", None)
|
||||
aws_web_identity_token = optional_params.get("aws_web_identity_token", None)
|
||||
aws_sts_endpoint = optional_params.get("aws_sts_endpoint", None)
|
||||
aws_region_name = self._get_aws_region_name(optional_params)
|
||||
|
||||
credentials: Credentials = self.get_credentials(
|
||||
|
@ -171,7 +167,7 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
|
|||
|
||||
return dict(request.headers)
|
||||
|
||||
def transform_request( # noqa: PLR0915
|
||||
def transform_request(
|
||||
self,
|
||||
model: str,
|
||||
messages: List[AllMessageValues],
|
||||
|
@ -194,7 +190,6 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
|
|||
for k, v in inference_params.items()
|
||||
if k not in self.aws_authentication_params
|
||||
}
|
||||
json_schemas: dict = {}
|
||||
request_data: dict = {}
|
||||
if provider == "cohere":
|
||||
if model.startswith("cohere.command-r"):
|
||||
|
@ -223,57 +218,21 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
|
|||
)
|
||||
request_data = {"prompt": prompt, **inference_params}
|
||||
elif provider == "anthropic":
|
||||
if model.startswith("anthropic.claude-3"):
|
||||
# Separate system prompt from rest of message
|
||||
system_prompt_idx: list[int] = []
|
||||
system_messages: list[str] = []
|
||||
for idx, message in enumerate(messages):
|
||||
if message["role"] == "system" and isinstance(
|
||||
message["content"], str
|
||||
):
|
||||
system_messages.append(message["content"])
|
||||
system_prompt_idx.append(idx)
|
||||
if len(system_prompt_idx) > 0:
|
||||
inference_params["system"] = "\n".join(system_messages)
|
||||
messages = [
|
||||
i for j, i in enumerate(messages) if j not in system_prompt_idx
|
||||
]
|
||||
# Format rest of message according to anthropic guidelines
|
||||
messages = prompt_factory(
|
||||
model=model, messages=messages, custom_llm_provider="anthropic_xml"
|
||||
) # type: ignore
|
||||
## LOAD CONFIG
|
||||
config = litellm.AmazonAnthropicClaude3Config.get_config()
|
||||
for k, v in config.items():
|
||||
if (
|
||||
k not in inference_params
|
||||
): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
inference_params[k] = v
|
||||
## Handle Tool Calling
|
||||
if "tools" in inference_params:
|
||||
_is_function_call = True
|
||||
for tool in inference_params["tools"]:
|
||||
json_schemas[tool["function"]["name"]] = tool["function"].get(
|
||||
"parameters", None
|
||||
)
|
||||
tool_calling_system_prompt = construct_tool_use_system_prompt(
|
||||
tools=inference_params["tools"]
|
||||
)
|
||||
inference_params["system"] = (
|
||||
inference_params.get("system", "\n")
|
||||
+ tool_calling_system_prompt
|
||||
) # add the anthropic tool calling prompt to the system prompt
|
||||
inference_params.pop("tools")
|
||||
request_data = {"messages": messages, **inference_params}
|
||||
else:
|
||||
## LOAD CONFIG
|
||||
config = litellm.AmazonAnthropicConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if (
|
||||
k not in inference_params
|
||||
): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
inference_params[k] = v
|
||||
request_data = {"prompt": prompt, **inference_params}
|
||||
return litellm.AmazonAnthropicClaude3Config().transform_request(
|
||||
model=model,
|
||||
messages=messages,
|
||||
optional_params=optional_params,
|
||||
litellm_params=litellm_params,
|
||||
headers=headers,
|
||||
)
|
||||
elif provider == "nova":
|
||||
return litellm.AmazonInvokeNovaConfig().transform_request(
|
||||
model=model,
|
||||
messages=messages,
|
||||
optional_params=optional_params,
|
||||
litellm_params=litellm_params,
|
||||
headers=headers,
|
||||
)
|
||||
elif provider == "ai21":
|
||||
## LOAD CONFIG
|
||||
config = litellm.AmazonAI21Config.get_config()
|
||||
|
@ -347,6 +306,10 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
|
|||
raise BedrockError(
|
||||
message=raw_response.text, status_code=raw_response.status_code
|
||||
)
|
||||
verbose_logger.debug(
|
||||
"bedrock invoke response % s",
|
||||
json.dumps(completion_response, indent=4, default=str),
|
||||
)
|
||||
provider = self.get_bedrock_invoke_provider(model)
|
||||
outputText: Optional[str] = None
|
||||
try:
|
||||
|
@ -359,66 +322,31 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
|
|||
completion_response["generations"][0]["finish_reason"]
|
||||
)
|
||||
elif provider == "anthropic":
|
||||
if model.startswith("anthropic.claude-3"):
|
||||
json_schemas: dict = {}
|
||||
_is_function_call = False
|
||||
## Handle Tool Calling
|
||||
if "tools" in optional_params:
|
||||
_is_function_call = True
|
||||
for tool in optional_params["tools"]:
|
||||
json_schemas[tool["function"]["name"]] = tool[
|
||||
"function"
|
||||
].get("parameters", None)
|
||||
outputText = completion_response.get("content")[0].get("text", None)
|
||||
if outputText is not None and contains_tag(
|
||||
"invoke", outputText
|
||||
): # OUTPUT PARSE FUNCTION CALL
|
||||
function_name = extract_between_tags("tool_name", outputText)[0]
|
||||
function_arguments_str = extract_between_tags(
|
||||
"invoke", outputText
|
||||
)[0].strip()
|
||||
function_arguments_str = (
|
||||
f"<invoke>{function_arguments_str}</invoke>"
|
||||
)
|
||||
function_arguments = parse_xml_params(
|
||||
function_arguments_str,
|
||||
json_schema=json_schemas.get(
|
||||
function_name, None
|
||||
), # check if we have a json schema for this function name)
|
||||
)
|
||||
_message = litellm.Message(
|
||||
tool_calls=[
|
||||
{
|
||||
"id": f"call_{uuid.uuid4()}",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": function_name,
|
||||
"arguments": json.dumps(function_arguments),
|
||||
},
|
||||
}
|
||||
],
|
||||
content=None,
|
||||
)
|
||||
model_response.choices[0].message = _message # type: ignore
|
||||
model_response._hidden_params["original_response"] = (
|
||||
outputText # allow user to access raw anthropic tool calling response
|
||||
)
|
||||
model_response.choices[0].finish_reason = map_finish_reason(
|
||||
completion_response.get("stop_reason", "")
|
||||
)
|
||||
_usage = litellm.Usage(
|
||||
prompt_tokens=completion_response["usage"]["input_tokens"],
|
||||
completion_tokens=completion_response["usage"]["output_tokens"],
|
||||
total_tokens=completion_response["usage"]["input_tokens"]
|
||||
+ completion_response["usage"]["output_tokens"],
|
||||
)
|
||||
setattr(model_response, "usage", _usage)
|
||||
else:
|
||||
outputText = completion_response["completion"]
|
||||
|
||||
model_response.choices[0].finish_reason = completion_response[
|
||||
"stop_reason"
|
||||
]
|
||||
return litellm.AmazonAnthropicClaude3Config().transform_response(
|
||||
model=model,
|
||||
raw_response=raw_response,
|
||||
model_response=model_response,
|
||||
logging_obj=logging_obj,
|
||||
request_data=request_data,
|
||||
messages=messages,
|
||||
optional_params=optional_params,
|
||||
litellm_params=litellm_params,
|
||||
encoding=encoding,
|
||||
api_key=api_key,
|
||||
json_mode=json_mode,
|
||||
)
|
||||
elif provider == "nova":
|
||||
return litellm.AmazonInvokeNovaConfig().transform_response(
|
||||
model=model,
|
||||
raw_response=raw_response,
|
||||
model_response=model_response,
|
||||
logging_obj=logging_obj,
|
||||
request_data=request_data,
|
||||
messages=messages,
|
||||
optional_params=optional_params,
|
||||
litellm_params=litellm_params,
|
||||
encoding=encoding,
|
||||
)
|
||||
elif provider == "ai21":
|
||||
outputText = (
|
||||
completion_response.get("completions")[0].get("data").get("text")
|
||||
|
@ -536,6 +464,7 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
|
|||
messages=messages,
|
||||
logging_obj=logging_obj,
|
||||
fake_stream=True if "ai21" in api_base else False,
|
||||
bedrock_invoke_provider=self.get_bedrock_invoke_provider(model),
|
||||
),
|
||||
model=model,
|
||||
custom_llm_provider="bedrock",
|
||||
|
@ -569,6 +498,7 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
|
|||
messages=messages,
|
||||
logging_obj=logging_obj,
|
||||
fake_stream=True if "ai21" in api_base else False,
|
||||
bedrock_invoke_provider=self.get_bedrock_invoke_provider(model),
|
||||
),
|
||||
model=model,
|
||||
custom_llm_provider="bedrock",
|
||||
|
@ -594,10 +524,15 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
        """
        Helper function to get the bedrock provider from the model

        handles 2 scenarios:
        1. model=anthropic.claude-3-5-sonnet-20240620-v1:0 -> Returns `anthropic`
        2. model=llama/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n -> Returns `llama`
        handles 4 scenarios:
        1. model=invoke/anthropic.claude-3-5-sonnet-20240620-v1:0 -> Returns `anthropic`
        2. model=anthropic.claude-3-5-sonnet-20240620-v1:0 -> Returns `anthropic`
        3. model=llama/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n -> Returns `llama`
        4. model=us.amazon.nova-pro-v1:0 -> Returns `nova`
        """
        if model.startswith("invoke/"):
            model = model.replace("invoke/", "", 1)

        _split_model = model.split(".")[0]
        if _split_model in get_args(litellm.BEDROCK_INVOKE_PROVIDERS_LITERAL):
            return cast(litellm.BEDROCK_INVOKE_PROVIDERS_LITERAL, _split_model)

@ -606,6 +541,10 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
        provider = AmazonInvokeConfig._get_provider_from_model_path(model)
        if provider is not None:
            return provider

        # check if provider == "nova"
        if "nova" in model:
            return "nova"
        return None

    @staticmethod
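
Note: a standalone sketch of the provider-detection order these two hunks implement (strip the invoke/ prefix, check the dotted prefix, then the path prefix, then fall back to the "nova" substring check); the provider set here is an illustrative assumption, not litellm.BEDROCK_INVOKE_PROVIDERS_LITERAL.

KNOWN_PROVIDERS = {"anthropic", "cohere", "llama", "mistral", "amazon"}  # assumed

def get_invoke_provider(model: str):
    if model.startswith("invoke/"):
        model = model.replace("invoke/", "", 1)
    split_model = model.split(".")[0]
    if split_model in KNOWN_PROVIDERS:
        return split_model
    path_prefix = model.split("/")[0]  # e.g. llama/arn:aws:bedrock:...
    if path_prefix in KNOWN_PROVIDERS:
        return path_prefix
    if "nova" in model:
        return "nova"
    return None

print(get_invoke_provider("invoke/anthropic.claude-3-5-sonnet-20240620-v1:0"))  # anthropic
print(get_invoke_provider("us.amazon.nova-pro-v1:0"))                           # nova
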
@ -640,16 +579,16 @@ class AmazonInvokeConfig(BaseConfig, BaseAWSLLM):
|
|||
else:
|
||||
modelId = model
|
||||
|
||||
modelId = modelId.replace("invoke/", "", 1)
|
||||
if provider == "llama" and "llama/" in modelId:
|
||||
modelId = self._get_model_id_for_llama_like_model(modelId)
|
||||
|
||||
return modelId
|
||||
|
||||
def _get_aws_region_name(self, optional_params: dict) -> str:
|
||||
"""
|
||||
Get the AWS region name from the environment variables
|
||||
"""
|
||||
aws_region_name = optional_params.pop("aws_region_name", None)
|
||||
aws_region_name = optional_params.get("aws_region_name", None)
|
||||
### SET REGION NAME ###
|
||||
if aws_region_name is None:
|
||||
# check env #
|
||||
|
|
|
@@ -3,11 +3,12 @@ Common utilities used across bedrock chat/embedding/image generation
"""

import os
from typing import List, Optional, Union
from typing import List, Literal, Optional, Union

import httpx

import litellm
from litellm.llms.base_llm.base_utils import BaseLLMModelInfo
from litellm.llms.base_llm.chat.transformation import BaseLLMException
from litellm.secret_managers.main import get_secret
@@ -310,3 +311,68 @@ def get_bedrock_tool_name(response_tool_name: str) -> str:
            response_tool_name
        ]
    return response_tool_name


class BedrockModelInfo(BaseLLMModelInfo):

    global_config = AmazonBedrockGlobalConfig()
    all_global_regions = global_config.get_all_regions()

    @staticmethod
    def get_base_model(model: str) -> str:
        """
        Get the base model from the given model name.

        Handle model names like - "us.meta.llama3-2-11b-instruct-v1:0" -> "meta.llama3-2-11b-instruct-v1"
        AND "meta.llama3-2-11b-instruct-v1:0" -> "meta.llama3-2-11b-instruct-v1"
        """
        if model.startswith("bedrock/"):
            model = model.split("/", 1)[1]

        if model.startswith("converse/"):
            model = model.split("/", 1)[1]

        if model.startswith("invoke/"):
            model = model.split("/", 1)[1]

        potential_region = model.split(".", 1)[0]

        alt_potential_region = model.split("/", 1)[
            0
        ]  # in model cost map we store regional information like `/us-west-2/bedrock-model`

        if (
            potential_region
            in BedrockModelInfo._supported_cross_region_inference_region()
        ):
            return model.split(".", 1)[1]
        elif (
            alt_potential_region in BedrockModelInfo.all_global_regions
            and len(model.split("/", 1)) > 1
        ):
            return model.split("/", 1)[1]

        return model

    @staticmethod
    def _supported_cross_region_inference_region() -> List[str]:
        """
        Abbreviations of regions AWS Bedrock supports for cross region inference
        """
        return ["us", "eu", "apac"]

    @staticmethod
    def get_bedrock_route(model: str) -> Literal["converse", "invoke", "converse_like"]:
        """
        Get the bedrock route for the given model.
        """
        base_model = BedrockModelInfo.get_base_model(model)
        if "invoke/" in model:
            return "invoke"
        elif "converse_like" in model:
            return "converse_like"
        elif "converse/" in model:
            return "converse"
        elif base_model in litellm.bedrock_converse_models:
            return "converse"
        return "invoke"
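A brief illustration of how these two helpers are expected to behave, assuming the model strings below and that litellm.bedrock_converse_models contains the Claude 3.5 Sonnet base id; this is a usage sketch, not a test taken from the repository.

# Usage sketch (assumed inputs/outputs, not from the repo's test suite):
# the cross-region prefix "us." is stripped before any cost-map lookup
base = BedrockModelInfo.get_base_model("bedrock/us.meta.llama3-2-11b-instruct-v1:0")
# base == "meta.llama3-2-11b-instruct-v1:0"

# explicit route prefixes always win
route = BedrockModelInfo.get_bedrock_route("invoke/anthropic.claude-3-5-sonnet-20240620-v1:0")
# route == "invoke"

# otherwise the route falls back to "converse" only when the base model is a known converse model
route = BedrockModelInfo.get_bedrock_route("anthropic.claude-3-5-sonnet-20240620-v1:0")
# route == "converse" (assuming the base id is listed in litellm.bedrock_converse_models)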
@@ -27,7 +27,7 @@ class AmazonTitanG1Config:
    def __init__(
        self,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -33,7 +33,7 @@ class AmazonTitanV2Config:
    def __init__(
        self, normalize: Optional[bool] = None, dimensions: Optional[int] = None
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -49,7 +49,7 @@ class AmazonStabilityConfig:
        width: Optional[int] = None,
        height: Optional[int] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -45,7 +45,7 @@ class ClarifaiConfig(BaseConfig):
        temperature: Optional[int] = None,
        top_k: Optional[int] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -44,7 +44,7 @@ class CloudflareChatConfig(BaseConfig):
        max_tokens: Optional[int] = None,
        stream: Optional[bool] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -104,7 +104,7 @@ class CohereChatConfig(BaseConfig):
        tool_results: Optional[list] = None,
        seed: Optional[int] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -86,7 +86,7 @@ class CohereTextConfig(BaseConfig):
        return_likelihoods: Optional[str] = None,
        logit_bias: Optional[dict] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)
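Each of these hunks applies the same one-line fix: the dict returned by locals() is copied before iteration. The likely rationale (an assumption on my part, not stated in the diff) is that under debuggers or tracers the frame's locals mapping can be refreshed while the loop runs, tripping Python's "dictionary changed size during iteration" guard; iterating a copy is immune to that. The snippet below only demonstrates the generic dict-mutation error and the copy-based fix:

# Minimal sketch of the failure mode the .copy() change guards against (assumed rationale):
params = {"max_tokens": 256, "temperature": 0.2}

try:
    for key, value in params.items():
        params[f"seen_{key}"] = value       # mutating the dict we are iterating over
except RuntimeError as e:
    print(e)                                # dictionary changed size during iteration

for key, value in params.copy().items():    # snapshot first -> safe to mutate the original
    params[f"copied_{key}"] = value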
@@ -1,5 +1,6 @@
import asyncio
import os
import time
from typing import TYPE_CHECKING, Any, Callable, List, Mapping, Optional, Union

import httpx

@@ -179,6 +180,7 @@ class AsyncHTTPHandler:
        stream: bool = False,
        logging_obj: Optional[LiteLLMLoggingObject] = None,
    ):
        start_time = time.time()
        try:
            if timeout is None:
                timeout = self.timeout

@@ -207,6 +209,8 @@ class AsyncHTTPHandler:
            finally:
                await new_client.aclose()
        except httpx.TimeoutException as e:
            end_time = time.time()
            time_delta = round(end_time - start_time, 3)
            headers = {}
            error_response = getattr(e, "response", None)
            if error_response is not None:

@@ -214,7 +218,7 @@ class AsyncHTTPHandler:
                    headers["response_headers-{}".format(key)] = value

            raise litellm.Timeout(
                message=f"Connection timed out after {timeout} seconds.",
                message=f"Connection timed out. Timeout passed={timeout}, time taken={time_delta} seconds",
                model="default-model-name",
                llm_provider="litellm-httpx-handler",
                headers=headers,
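The change above enriches the timeout error with how long the call actually ran, not just the configured limit. A small asyncio sketch of the same pattern, using a hypothetical helper name and plain httpx rather than litellm's handler:

import asyncio
import time

import httpx

async def fetch_with_timing(url: str, timeout: float = 5.0) -> httpx.Response:
    # Hypothetical helper mirroring the pattern: record wall-clock time so the
    # timeout error can report both the configured limit and the observed duration.
    start_time = time.time()
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            return await client.get(url)
    except httpx.TimeoutException as e:
        time_delta = round(time.time() - start_time, 3)
        raise RuntimeError(
            f"Connection timed out. Timeout passed={timeout}, time taken={time_delta} seconds"
        ) from e

# asyncio.run(fetch_with_timing("https://example.com"))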
@@ -37,7 +37,7 @@ class DatabricksConfig(OpenAILikeChatConfig):
        stop: Optional[Union[List[str], str]] = None,
        n: Optional[int] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -16,7 +16,7 @@ class DatabricksEmbeddingConfig:
    )

    def __init__(self, instruction: Optional[str] = None) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -145,7 +145,7 @@ class AlephAlphaConfig:
        contextual_control_threshold: Optional[int] = None,
        control_log_additive: Optional[bool] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -63,7 +63,7 @@ class PalmConfig:
        top_p: Optional[float] = None,
        max_output_tokens: Optional[int] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -57,7 +57,7 @@ class GoogleAIStudioGeminiConfig(VertexGeminiConfig):
        candidate_count: Optional[int] = None,
        stop_sequences: Optional[list] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -77,7 +77,7 @@ class HuggingfaceChatConfig(BaseConfig):
        typical_p: Optional[float] = None,
        watermark: Optional[bool] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)
@@ -20,6 +20,15 @@ from .common_utils import InfinityError


class InfinityRerankConfig(CohereRerankConfig):
    def get_complete_url(self, api_base: Optional[str], model: str) -> str:
        if api_base is None:
            raise ValueError("api_base is required for Infinity rerank")
        # Remove trailing slashes and ensure clean base URL
        api_base = api_base.rstrip("/")
        if not api_base.endswith("/rerank"):
            api_base = f"{api_base}/rerank"
        return api_base

    def validate_environment(
        self,
        headers: dict,
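The new URL helper normalizes whatever base URL the user configured so requests always land on the /rerank endpoint. A quick illustration of the expected inputs and outputs (assumed examples, not taken from the repository's tests):

def complete_rerank_url(api_base: str) -> str:
    # Same normalization as above, shown standalone for illustration.
    api_base = api_base.rstrip("/")
    if not api_base.endswith("/rerank"):
        api_base = f"{api_base}/rerank"
    return api_base

assert complete_rerank_url("http://localhost:7997") == "http://localhost:7997/rerank"
assert complete_rerank_url("http://localhost:7997/") == "http://localhost:7997/rerank"
assert complete_rerank_url("http://localhost:7997/rerank") == "http://localhost:7997/rerank"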
@@ -21,7 +21,7 @@ class JinaAIEmbeddingConfig:
    def __init__(
        self,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -18,7 +18,7 @@ class LmStudioEmbeddingConfig:
    def __init__(
        self,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -33,7 +33,7 @@ class MaritalkConfig(OpenAIGPTConfig):
        tools: Optional[List[dict]] = None,
        tool_choice: Optional[Union[str, dict]] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -78,7 +78,7 @@ class NLPCloudConfig(BaseConfig):
        num_beams: Optional[int] = None,
        num_return_sequences: Optional[int] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)
@@ -32,7 +32,7 @@ class NvidiaNimEmbeddingConfig:
        input_type: Optional[str] = None,
        truncate: Optional[str] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -58,7 +58,7 @@ class NvidiaNimEmbeddingConfig:
    def get_supported_openai_params(
        self,
    ):
        return ["encoding_format", "user"]
        return ["encoding_format", "user", "dimensions"]

    def map_openai_params(
        self,

@@ -73,6 +73,8 @@ class NvidiaNimEmbeddingConfig:
                optional_params["extra_body"].update({"input_type": v})
            elif k == "truncate":
                optional_params["extra_body"].update({"truncate": v})
            else:
                optional_params[k] = v

        if kwargs is not None:
            # pass kwargs in extra_body
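These NVIDIA NIM hunks add "dimensions" to the supported OpenAI embedding params and route "truncate" into the request's extra_body. A rough sketch of the mapping behaviour being extended (parameter names follow the hunk; the standalone function itself is illustrative, not the class method):

def map_nim_embedding_params(non_default_params: dict) -> dict:
    # Illustrative mapping: NIM-specific knobs go under extra_body,
    # OpenAI-compatible ones (encoding_format, user, dimensions) pass through.
    optional_params: dict = {"extra_body": {}}
    for k, v in non_default_params.items():
        if k == "input_type":
            optional_params["extra_body"].update({"input_type": v})
        elif k == "truncate":
            optional_params["extra_body"].update({"truncate": v})
        else:
            optional_params[k] = v
    return optional_params

print(map_nim_embedding_params({"dimensions": 512, "truncate": "END"}))
# {'extra_body': {'truncate': 'END'}, 'dimensions': 512}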
@@ -117,7 +117,7 @@ class OllamaConfig(BaseConfig):
        system: Optional[str] = None,
        template: Optional[str] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -105,7 +105,7 @@ class OllamaChatConfig(OpenAIGPTConfig):
        system: Optional[str] = None,
        template: Optional[str] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)
@@ -344,6 +344,10 @@ class OpenAIGPTConfig(BaseLLMModelInfo, BaseConfig):
            or "https://api.openai.com/v1"
        )

    @staticmethod
    def get_base_model(model: str) -> str:
        return model

    def get_model_response_iterator(
        self,
        streaming_response: Union[Iterator[str], AsyncIterator[str], ModelResponse],

@@ -54,7 +54,7 @@ class OpenAIOSeriesConfig(OpenAIGPTConfig):

        if model is None:
            return True
        supported_stream_models = ["o1-mini", "o1-preview"]
        supported_stream_models = ["o1-mini", "o1-preview", "o3-mini"]
        for supported_model in supported_stream_models:
            if supported_model in model:
                return False
@@ -27,6 +27,7 @@ from typing_extensions import overload
import litellm
from litellm import LlmProviders
from litellm._logging import verbose_logger
from litellm.constants import DEFAULT_MAX_RETRIES
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.litellm_core_utils.logging_utils import track_llm_api_timing
from litellm.llms.base_llm.base_model_iterator import BaseModelResponseIterator

@@ -320,6 +321,17 @@ class OpenAIChatCompletion(BaseLLM):
    def __init__(self) -> None:
        super().__init__()

    def _set_dynamic_params_on_client(
        self,
        client: Union[OpenAI, AsyncOpenAI],
        organization: Optional[str] = None,
        max_retries: Optional[int] = None,
    ):
        if organization is not None:
            client.organization = organization
        if max_retries is not None:
            client.max_retries = max_retries

    def _get_openai_client(
        self,
        is_async: bool,

@@ -327,11 +339,10 @@ class OpenAIChatCompletion(BaseLLM):
        api_base: Optional[str] = None,
        api_version: Optional[str] = None,
        timeout: Union[float, httpx.Timeout] = httpx.Timeout(None),
        max_retries: Optional[int] = 2,
        max_retries: Optional[int] = DEFAULT_MAX_RETRIES,
        organization: Optional[str] = None,
        client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
    ):
        args = locals()
        if client is None:
            if not isinstance(max_retries, int):
                raise OpenAIError(

@@ -364,7 +375,6 @@ class OpenAIChatCompletion(BaseLLM):
                    organization=organization,
                )
            else:

                _new_client = OpenAI(
                    api_key=api_key,
                    base_url=api_base,

@@ -383,6 +393,11 @@ class OpenAIChatCompletion(BaseLLM):
            return _new_client

        else:
            self._set_dynamic_params_on_client(
                client=client,
                organization=organization,
                max_retries=max_retries,
            )
            return client

    @track_llm_api_timing()
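The new _set_dynamic_params_on_client lets a cached or user-supplied OpenAI client be reused while still honoring per-request organization and max_retries overrides. A hedged sketch of that reuse pattern against the openai SDK (the standalone function and usage line are illustrative, not litellm's API):

from typing import Optional, Union

from openai import AsyncOpenAI, OpenAI

def set_dynamic_params_on_client(
    client: Union[OpenAI, AsyncOpenAI],
    organization: Optional[str] = None,
    max_retries: Optional[int] = None,
) -> Union[OpenAI, AsyncOpenAI]:
    # Mutate only the fields the caller actually overrode, so the cached
    # client keeps its existing api_key/base_url/timeout configuration.
    if organization is not None:
        client.organization = organization
    if max_retries is not None:
        client.max_retries = max_retries
    return client

# reused_client = set_dynamic_params_on_client(OpenAI(api_key="sk-..."), max_retries=5)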
@@ -58,7 +58,7 @@ class PetalsConfig(BaseConfig):
        top_p: Optional[float] = None,
        repetition_penalty: Optional[float] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -59,7 +59,7 @@ class PredibaseConfig(BaseConfig):
        typical_p: Optional[float] = None,
        watermark: Optional[bool] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -73,7 +73,7 @@ class ReplicateConfig(BaseConfig):
        seed: Optional[int] = None,
        debug: Optional[bool] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -47,7 +47,7 @@ class SagemakerConfig(BaseConfig):
        temperature: Optional[float] = None,
        return_full_text: Optional[bool] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)
@@ -29,3 +29,7 @@ class TopazModelInfo(BaseLLMModelInfo):
        return (
            api_base or get_secret_str("TOPAZ_API_BASE") or "https://api.topazlabs.com"
        )

    @staticmethod
    def get_base_model(model: str) -> str:
        return model
@@ -179,7 +179,7 @@ class VertexGeminiConfig(VertexAIBaseConfig, BaseConfig):
        presence_penalty: Optional[float] = None,
        seed: Optional[int] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -17,7 +17,7 @@ class VertexAIAi21Config:
        self,
        max_tokens: Optional[int] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -21,7 +21,7 @@ class VertexAILlama3Config:
        self,
        max_tokens: Optional[int] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key == "max_tokens" and value is None:
                value = self.max_tokens

@@ -48,7 +48,7 @@ class VertexAITextEmbeddingConfig(BaseModel):
        ] = None,
        title: Optional[str] = None,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)

@@ -108,7 +108,7 @@ class IBMWatsonXAIConfig(IBMWatsonXMixin, BaseConfig):
        stream: Optional[bool] = None,
        **kwargs,
    ) -> None:
        locals_ = locals()
        locals_ = locals().copy()
        for key, value in locals_.items():
            if key != "self" and value is not None:
                setattr(self.__class__, key, value)
@@ -68,6 +68,7 @@ from litellm.litellm_core_utils.prompt_templates.common_utils import (
    get_content_from_model_response,
)
from litellm.llms.base_llm.chat.transformation import BaseConfig
from litellm.llms.bedrock.common_utils import BedrockModelInfo
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.realtime_api.main import _realtime_health_check
from litellm.secret_managers.main import get_secret_str

@@ -1222,6 +1223,8 @@ def completion( # type: ignore # noqa: PLR0915

        if extra_headers is not None:
            optional_params["extra_headers"] = extra_headers
        if max_retries is not None:
            optional_params["max_retries"] = max_retries

        if litellm.AzureOpenAIO1Config().is_o_series_model(model=model):

@@ -2626,11 +2629,8 @@ def completion( # type: ignore # noqa: PLR0915
                        aws_bedrock_client.meta.region_name
                    )

            base_model = litellm.AmazonConverseConfig()._get_base_model(model)

            if base_model in litellm.bedrock_converse_models or model.startswith(
                "converse/"
            ):
            bedrock_route = BedrockModelInfo.get_bedrock_route(model)
            if bedrock_route == "converse":
                model = model.replace("converse/", "")
                response = bedrock_converse_chat_completion.completion(
                    model=model,

@@ -2649,7 +2649,7 @@ def completion( # type: ignore # noqa: PLR0915
                    client=client,
                    api_base=api_base,
                )
            elif "converse_like" in model:
            elif bedrock_route == "converse_like":
                model = model.replace("converse_like/", "")
                response = base_llm_http_handler.completion(
                    model=model,

@@ -3947,6 +3947,7 @@ async def atext_completion(
            ),
            model=model,
            custom_llm_provider=custom_llm_provider,
            stream_options=kwargs.get('stream_options'),
        )
    else:
        ## OpenAI / Azure Text Completion Returns here
@ -1069,6 +1069,21 @@
|
|||
"supports_prompt_caching": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
"azure/o1-2024-12-17": {
|
||||
"max_tokens": 100000,
|
||||
"max_input_tokens": 200000,
|
||||
"max_output_tokens": 100000,
|
||||
"input_cost_per_token": 0.000015,
|
||||
"output_cost_per_token": 0.000060,
|
||||
"cache_read_input_token_cost": 0.0000075,
|
||||
"litellm_provider": "azure",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_parallel_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"supports_prompt_caching": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
"azure/o1-preview": {
|
||||
"max_tokens": 32768,
|
||||
"max_input_tokens": 128000,
|
||||
|
@ -1397,6 +1412,19 @@
|
|||
"deprecation_date": "2025-03-31",
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
"azure/gpt-3.5-turbo-0125": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 16384,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.0000005,
|
||||
"output_cost_per_token": 0.0000015,
|
||||
"litellm_provider": "azure",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_parallel_function_calling": true,
|
||||
"deprecation_date": "2025-03-31",
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
"azure/gpt-35-turbo-16k": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 16385,
|
||||
|
@ -1418,6 +1446,17 @@
|
|||
"supports_function_calling": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
"azure/gpt-3.5-turbo": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 4097,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.0000005,
|
||||
"output_cost_per_token": 0.0000015,
|
||||
"litellm_provider": "azure",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
"azure/gpt-3.5-turbo-instruct-0914": {
|
||||
"max_tokens": 4097,
|
||||
"max_input_tokens": 4097,
|
||||
|
@@ -2174,11 +2213,11 @@
    "max_tokens": 8192,
    "max_input_tokens": 65536,
    "max_output_tokens": 8192,
    "input_cost_per_token": 0.00000014,
    "input_cost_per_token_cache_hit": 0.000000014,
    "cache_read_input_token_cost": 0.000000014,
    "input_cost_per_token": 0.00000027,
    "input_cost_per_token_cache_hit": 0.00000007,
    "cache_read_input_token_cost": 0.00000007,
    "cache_creation_input_token_cost": 0.0,
    "output_cost_per_token": 0.00000028,
    "output_cost_per_token": 0.0000011,
    "litellm_provider": "deepseek",
    "mode": "chat",
    "supports_function_calling": true,
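This hunk raises the DeepSeek chat prices (per-token input 0.00000014 -> 0.00000027 USD, cache hit 0.000000014 -> 0.00000007, output 0.00000028 -> 0.0000011). A quick sanity check of what the new rates imply for a sample request (my arithmetic, not part of the diff):

# Cost at the updated DeepSeek rates for a hypothetical 10k-in / 2k-out call.
input_tokens, output_tokens = 10_000, 2_000
input_cost = input_tokens * 0.00000027     # $0.0027
output_cost = output_tokens * 0.0000011    # $0.0022
print(round(input_cost + output_cost, 6))  # 0.0049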
@ -3650,9 +3689,34 @@
|
|||
"supports_vision": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_audio_output": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-2.0-flash",
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing",
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
"gemini/gemini-2.0-flash": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1048576,
|
||||
"max_output_tokens": 8192,
|
||||
"max_images_per_prompt": 3000,
|
||||
"max_videos_per_prompt": 10,
|
||||
"max_video_length": 1,
|
||||
"max_audio_length_hours": 8.4,
|
||||
"max_audio_per_prompt": 1,
|
||||
"max_pdf_size_mb": 30,
|
||||
"input_cost_per_audio_token": 0.0000007,
|
||||
"input_cost_per_token": 0.0000001,
|
||||
"output_cost_per_token": 0.0000004,
|
||||
"litellm_provider": "gemini",
|
||||
"mode": "chat",
|
||||
"rpm": 10000,
|
||||
"tpm": 10000000,
|
||||
"supports_system_messages": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_audio_output": true,
|
||||
"supports_tool_choice": true,
|
||||
"source": "https://ai.google.dev/pricing#2_0flash"
|
||||
},
|
||||
"gemini-2.0-flash-001": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1048576,
|
||||
|
@ -3663,9 +3727,9 @@
|
|||
"max_audio_length_hours": 8.4,
|
||||
"max_audio_per_prompt": 1,
|
||||
"max_pdf_size_mb": 30,
|
||||
"input_cost_per_audio_token": 0.001,
|
||||
"input_cost_per_token": 0.00015,
|
||||
"output_cost_per_token": 0.0006,
|
||||
"input_cost_per_audio_token": 0.000001,
|
||||
"input_cost_per_token": 0.00000015,
|
||||
"output_cost_per_token": 0.0000006,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_system_messages": true,
|
||||
|
@ -3674,7 +3738,7 @@
|
|||
"supports_response_schema": true,
|
||||
"supports_audio_output": true,
|
||||
"supports_tool_choice": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-2.0-flash"
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing"
|
||||
},
|
||||
"gemini-2.0-flash-thinking-exp": {
|
||||
"max_tokens": 8192,
|
||||
|
@ -3744,6 +3808,31 @@
|
|||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-2.0-flash",
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
"gemini/gemini-2.0-flash-001": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1048576,
|
||||
"max_output_tokens": 8192,
|
||||
"max_images_per_prompt": 3000,
|
||||
"max_videos_per_prompt": 10,
|
||||
"max_video_length": 1,
|
||||
"max_audio_length_hours": 8.4,
|
||||
"max_audio_per_prompt": 1,
|
||||
"max_pdf_size_mb": 30,
|
||||
"input_cost_per_audio_token": 0.0000007,
|
||||
"input_cost_per_token": 0.0000001,
|
||||
"output_cost_per_token": 0.0000004,
|
||||
"litellm_provider": "gemini",
|
||||
"mode": "chat",
|
||||
"rpm": 10000,
|
||||
"tpm": 10000000,
|
||||
"supports_system_messages": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_audio_output": false,
|
||||
"supports_tool_choice": true,
|
||||
"source": "https://ai.google.dev/pricing#2_0flash"
|
||||
},
|
||||
"gemini/gemini-2.0-flash-exp": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1048576,
|
||||
|
@ -3780,6 +3869,31 @@
|
|||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-2.0-flash",
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
"gemini/gemini-2.0-flash-lite-preview-02-05": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1048576,
|
||||
"max_output_tokens": 8192,
|
||||
"max_images_per_prompt": 3000,
|
||||
"max_videos_per_prompt": 10,
|
||||
"max_video_length": 1,
|
||||
"max_audio_length_hours": 8.4,
|
||||
"max_audio_per_prompt": 1,
|
||||
"max_pdf_size_mb": 30,
|
||||
"input_cost_per_audio_token": 0.000000075,
|
||||
"input_cost_per_token": 0.000000075,
|
||||
"output_cost_per_token": 0.0000003,
|
||||
"litellm_provider": "gemini",
|
||||
"mode": "chat",
|
||||
"rpm": 60000,
|
||||
"tpm": 10000000,
|
||||
"supports_system_messages": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_audio_output": false,
|
||||
"supports_tool_choice": true,
|
||||
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#gemini-2.0-flash-lite"
|
||||
},
|
||||
"gemini/gemini-2.0-flash-thinking-exp": {
|
||||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1048576,
|
||||
|
@ -6026,7 +6140,8 @@
|
|||
"litellm_provider": "bedrock_converse",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_prompt_caching": true
|
||||
"supports_prompt_caching": true,
|
||||
"supports_response_schema": true
|
||||
},
|
||||
"us.amazon.nova-micro-v1:0": {
|
||||
"max_tokens": 4096,
|
||||
|
@ -6051,7 +6166,8 @@
|
|||
"supports_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"supports_pdf_input": true,
|
||||
"supports_prompt_caching": true
|
||||
"supports_prompt_caching": true,
|
||||
"supports_response_schema": true
|
||||
},
|
||||
"us.amazon.nova-lite-v1:0": {
|
||||
"max_tokens": 4096,
|
||||
|
@ -6064,7 +6180,8 @@
|
|||
"supports_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"supports_pdf_input": true,
|
||||
"supports_prompt_caching": true
|
||||
"supports_prompt_caching": true,
|
||||
"supports_response_schema": true
|
||||
},
|
||||
"amazon.nova-pro-v1:0": {
|
||||
"max_tokens": 4096,
|
||||
|
@ -6077,7 +6194,8 @@
|
|||
"supports_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"supports_pdf_input": true,
|
||||
"supports_prompt_caching": true
|
||||
"supports_prompt_caching": true,
|
||||
"supports_response_schema": true
|
||||
},
|
||||
"us.amazon.nova-pro-v1:0": {
|
||||
"max_tokens": 4096,
|
||||
|
@ -6090,7 +6208,8 @@
|
|||
"supports_function_calling": true,
|
||||
"supports_vision": true,
|
||||
"supports_pdf_input": true,
|
||||
"supports_prompt_caching": true
|
||||
"supports_prompt_caching": true,
|
||||
"supports_response_schema": true
|
||||
},
|
||||
"anthropic.claude-3-sonnet-20240229-v1:0": {
|
||||
"max_tokens": 4096,
|
||||
|
@ -6101,6 +6220,7 @@
|
|||
"litellm_provider": "bedrock",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_vision": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
|
@ -6113,6 +6233,7 @@
|
|||
"litellm_provider": "bedrock",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_vision": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
|
@ -6140,6 +6261,7 @@
|
|||
"litellm_provider": "bedrock",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_vision": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
|
@ -6153,6 +6275,7 @@
|
|||
"mode": "chat",
|
||||
"supports_assistant_prefill": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_prompt_caching": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
|
@ -6165,6 +6288,7 @@
|
|||
"litellm_provider": "bedrock",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_vision": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
|
@ -6177,6 +6301,7 @@
|
|||
"litellm_provider": "bedrock",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_vision": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
|
@ -6189,6 +6314,7 @@
|
|||
"litellm_provider": "bedrock",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_vision": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
|
@ -6216,6 +6342,7 @@
|
|||
"litellm_provider": "bedrock",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_vision": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
|
@ -6230,6 +6357,7 @@
|
|||
"supports_assistant_prefill": true,
|
||||
"supports_function_calling": true,
|
||||
"supports_prompt_caching": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
"us.anthropic.claude-3-opus-20240229-v1:0": {
|
||||
|
@ -6241,6 +6369,7 @@
|
|||
"litellm_provider": "bedrock",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_vision": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
|
@ -6253,6 +6382,7 @@
|
|||
"litellm_provider": "bedrock",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_vision": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
|
@ -6265,6 +6395,7 @@
|
|||
"litellm_provider": "bedrock",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_vision": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
|
@ -6292,6 +6423,7 @@
|
|||
"litellm_provider": "bedrock",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_vision": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
|
@ -6318,6 +6450,7 @@
|
|||
"litellm_provider": "bedrock",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
"supports_response_schema": true,
|
||||
"supports_vision": true,
|
||||
"supports_tool_choice": true
|
||||
},
|
||||
|
@ -8935,4 +9068,4 @@
|
|||
"output_cost_per_second": 0.00,
|
||||
"litellm_provider": "assemblyai"
|
||||
}
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Some files were not shown because too many files have changed in this diff.